From d2e1911345ac035a1de371c5e30d45e2df4eab3d Mon Sep 17 00:00:00 2001 From: Basilisk-Dev Date: Mon, 23 Feb 2026 16:10:38 -0500 Subject: [PATCH] Issue #2925 - Upgrade libvpx to 1.16.0 --- media/libvpx/README_MOZILLA | 2 +- media/libvpx/config/generic/vp8_rtcd.h | 71 +- media/libvpx/config/generic/vp9_rtcd.h | 52 +- media/libvpx/config/generic/vpx_config.asm | 33 +- media/libvpx/config/generic/vpx_config.h | 29 +- media/libvpx/config/generic/vpx_dsp_rtcd.h | 452 +- media/libvpx/config/generic/vpx_scale_rtcd.h | 16 +- media/libvpx/config/linux/arm/vp8_rtcd.h | 167 +- media/libvpx/config/linux/arm/vp9_rtcd.h | 93 +- media/libvpx/config/linux/arm/vpx_config.asm | 33 +- media/libvpx/config/linux/arm/vpx_config.h | 29 +- media/libvpx/config/linux/arm/vpx_dsp_rtcd.h | 1236 +- .../libvpx/config/linux/arm/vpx_scale_rtcd.h | 16 +- media/libvpx/config/linux/arm64/vp8_rtcd.h | 211 + media/libvpx/config/linux/arm64/vp9_rtcd.h | 125 + .../libvpx/config/linux/arm64/vpx_config.asm | 97 + media/libvpx/config/linux/arm64/vpx_config.c | 10 + media/libvpx/config/linux/arm64/vpx_config.h | 108 + .../libvpx/config/linux/arm64/vpx_dsp_rtcd.h | 1197 + .../config/linux/arm64/vpx_scale_rtcd.h | 85 + media/libvpx/config/linux/ia32/vp8_rtcd.h | 206 +- media/libvpx/config/linux/ia32/vp9_rtcd.h | 117 +- media/libvpx/config/linux/ia32/vpx_config.asm | 29 +- media/libvpx/config/linux/ia32/vpx_config.h | 29 +- media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h | 1229 +- .../libvpx/config/linux/ia32/vpx_scale_rtcd.h | 16 +- .../config/linux/loongarch64/vp8_rtcd.h | 229 + .../config/linux/loongarch64/vp9_rtcd.h | 118 + .../config/linux/loongarch64/vpx_config.asm | 97 + .../config/linux/loongarch64/vpx_config.c | 10 + .../config/linux/loongarch64/vpx_config.h | 108 + .../config/linux/loongarch64/vpx_dsp_rtcd.h | 929 + .../config/linux/loongarch64/vpx_scale_rtcd.h | 84 + media/libvpx/config/linux/mips32/vp8_rtcd.h | 370 + media/libvpx/config/linux/mips32/vp9_rtcd.h | 244 + .../libvpx/config/linux/mips32/vpx_config.asm | 97 + media/libvpx/config/linux/mips32/vpx_config.c | 10 + media/libvpx/config/linux/mips32/vpx_config.h | 108 + .../libvpx/config/linux/mips32/vpx_dsp_rtcd.h | 1524 ++ .../config/linux/mips32/vpx_scale_rtcd.h | 176 + media/libvpx/config/linux/mips64/vp8_rtcd.h | 259 + media/libvpx/config/linux/mips64/vp9_rtcd.h | 138 + .../libvpx/config/linux/mips64/vpx_config.asm | 97 + media/libvpx/config/linux/mips64/vpx_config.c | 10 + media/libvpx/config/linux/mips64/vpx_config.h | 108 + .../libvpx/config/linux/mips64/vpx_dsp_rtcd.h | 1029 + .../config/linux/mips64/vpx_scale_rtcd.h | 96 + media/libvpx/config/linux/ppc64le/vp8_rtcd.h | 180 + media/libvpx/config/linux/ppc64le/vp9_rtcd.h | 132 + .../config/linux/ppc64le/vpx_config.asm | 97 + .../libvpx/config/linux/ppc64le/vpx_config.c | 10 + .../libvpx/config/linux/ppc64le/vpx_config.h | 108 + .../config/linux/ppc64le/vpx_dsp_rtcd.h | 1015 + .../config/linux/ppc64le/vpx_scale_rtcd.h | 83 + media/libvpx/config/linux/x64/vp8_rtcd.h | 168 +- media/libvpx/config/linux/x64/vp9_rtcd.h | 112 +- media/libvpx/config/linux/x64/vpx_config.asm | 29 +- media/libvpx/config/linux/x64/vpx_config.h | 29 +- media/libvpx/config/linux/x64/vpx_dsp_rtcd.h | 1113 +- .../libvpx/config/linux/x64/vpx_scale_rtcd.h | 16 +- media/libvpx/config/mac/arm64/vp8_rtcd.h | 211 + media/libvpx/config/mac/arm64/vp9_rtcd.h | 125 + media/libvpx/config/mac/arm64/vpx_config.asm | 97 + media/libvpx/config/mac/arm64/vpx_config.c | 10 + media/libvpx/config/mac/arm64/vpx_config.h | 108 + media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h | 1197 + .../libvpx/config/mac/arm64/vpx_scale_rtcd.h | 85 + media/libvpx/config/mac/ia32/vp8_rtcd.h | 206 +- media/libvpx/config/mac/ia32/vp9_rtcd.h | 117 +- media/libvpx/config/mac/ia32/vpx_config.asm | 29 +- media/libvpx/config/mac/ia32/vpx_config.h | 29 +- media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h | 1229 +- media/libvpx/config/mac/ia32/vpx_scale_rtcd.h | 16 +- media/libvpx/config/mac/x64/vp8_rtcd.h | 168 +- media/libvpx/config/mac/x64/vp9_rtcd.h | 112 +- media/libvpx/config/mac/x64/vpx_config.asm | 29 +- media/libvpx/config/mac/x64/vpx_config.h | 29 +- media/libvpx/config/mac/x64/vpx_dsp_rtcd.h | 1113 +- media/libvpx/config/mac/x64/vpx_scale_rtcd.h | 16 +- media/libvpx/config/vpx_version.h | 12 +- media/libvpx/config/win/ia32/vp8_rtcd.h | 206 +- media/libvpx/config/win/ia32/vp9_rtcd.h | 117 +- media/libvpx/config/win/ia32/vpx_config.asm | 29 +- media/libvpx/config/win/ia32/vpx_config.h | 29 +- media/libvpx/config/win/ia32/vpx_dsp_rtcd.h | 1229 +- media/libvpx/config/win/ia32/vpx_scale_rtcd.h | 16 +- media/libvpx/config/win/x64/vp8_rtcd.h | 168 +- media/libvpx/config/win/x64/vp9_rtcd.h | 112 +- media/libvpx/config/win/x64/vpx_config.asm | 29 +- media/libvpx/config/win/x64/vpx_config.c | 2 +- media/libvpx/config/win/x64/vpx_config.h | 31 +- media/libvpx/config/win/x64/vpx_dsp_rtcd.h | 1113 +- media/libvpx/config/win/x64/vpx_scale_rtcd.h | 16 +- media/libvpx/generate_sources_mozbuild.sh | 48 +- media/libvpx/libvpx/.clang-format | 84 +- media/libvpx/libvpx/.mailmap | 27 +- media/libvpx/libvpx/AUTHORS | 108 +- media/libvpx/libvpx/CHANGELOG | 492 +- media/libvpx/libvpx/CONTRIBUTING.md | 29 + media/libvpx/libvpx/README | 133 +- media/libvpx/libvpx/args.c | 35 +- media/libvpx/libvpx/args.h | 20 +- media/libvpx/libvpx/build/make/Android.mk | 56 +- media/libvpx/libvpx/build/make/Makefile | 66 +- .../libvpx/libvpx/build/make/ads2armasm_ms.pl | 2 +- media/libvpx/libvpx/build/make/ads2gas.pl | 161 +- .../libvpx/libvpx/build/make/ads2gas_apple.pl | 146 +- media/libvpx/libvpx/build/make/configure.sh | 659 +- .../libvpx/libvpx/build/make/gen_asm_deps.sh | 2 +- .../libvpx/libvpx/build/make/gen_msvs_sln.sh | 35 +- .../libvpx/build/make/gen_msvs_vcxproj.sh | 88 +- media/libvpx/libvpx/build/make/iosbuild.sh | 19 +- media/libvpx/libvpx/build/make/msvs_common.sh | 12 +- media/libvpx/libvpx/build/make/rtcd.pl | 162 +- media/libvpx/libvpx/build/make/thumb.pm | 12 +- media/libvpx/libvpx/build/make/version.sh | 4 + media/libvpx/libvpx/codereview.settings | 5 +- media/libvpx/libvpx/configure | 191 +- media/libvpx/libvpx/examples.mk | 73 +- .../libvpx/examples/decode_with_drops.c | 2 +- media/libvpx/libvpx/examples/postproc.c | 6 +- media/libvpx/libvpx/examples/resize_util.c | 123 - media/libvpx/libvpx/examples/set_maps.c | 2 +- media/libvpx/libvpx/examples/simple_decoder.c | 2 +- media/libvpx/libvpx/examples/simple_encoder.c | 2 +- .../libvpx/{vpx => examples}/svc_context.h | 27 +- .../{vpx/src => examples}/svc_encodeframe.c | 184 +- .../libvpx/libvpx/examples/twopass_encoder.c | 7 +- .../examples/vp8_multi_resolution_encoder.c | 54 +- media/libvpx/libvpx/examples/vp8cx_set_ref.c | 2 +- .../libvpx/examples/vp9_lossless_encoder.c | 2 +- .../libvpx/examples/vp9_spatial_svc_encoder.c | 870 +- media/libvpx/libvpx/examples/vp9cx_set_ref.c | 126 +- .../libvpx/libvpx/examples/vpx_dec_fuzzer.cc | 159 + .../libvpx/libvpx/examples/vpx_enc_fuzzer.cc | 236 + .../examples/vpx_temporal_svc_encoder.c | 338 +- media/libvpx/libvpx/ivfdec.c | 8 +- media/libvpx/libvpx/ivfdec.h | 6 +- media/libvpx/libvpx/ivfenc.c | 30 +- media/libvpx/libvpx/ivfenc.h | 13 +- media/libvpx/libvpx/libs.doxy_template | 60 +- media/libvpx/libvpx/libs.mk | 270 +- media/libvpx/libvpx/mainpage.dox | 2 + media/libvpx/libvpx/md5_utils.c | 20 +- media/libvpx/libvpx/md5_utils.h | 6 +- media/libvpx/libvpx/rate_hist.c | 70 +- media/libvpx/libvpx/rate_hist.h | 6 +- media/libvpx/libvpx/test/acm_random.h | 49 +- .../libvpx/test/active_map_refresh_test.cc | 20 +- media/libvpx/libvpx/test/active_map_test.cc | 24 +- media/libvpx/libvpx/test/add_noise_test.cc | 58 +- .../libvpx/test/alt_ref_aq_segment_test.cc | 20 +- media/libvpx/libvpx/test/altref_test.cc | 43 +- media/libvpx/libvpx/test/android/Android.mk | 11 + media/libvpx/libvpx/test/android/README | 11 +- media/libvpx/libvpx/test/android/get_files.py | 17 +- media/libvpx/libvpx/test/aq_segment_test.cc | 20 +- media/libvpx/libvpx/test/avg_test.cc | 529 +- media/libvpx/libvpx/test/bench.cc | 39 + media/libvpx/libvpx/test/bench.h | 32 + media/libvpx/libvpx/test/blockiness_test.cc | 37 +- media/libvpx/libvpx/test/borders_test.cc | 24 +- media/libvpx/libvpx/test/buffer.h | 382 + .../libvpx/libvpx/test/byte_alignment_test.cc | 30 +- media/libvpx/libvpx/test/clear_system_state.h | 16 +- media/libvpx/libvpx/test/codec_factory.h | 97 +- .../libvpx/libvpx/test/comp_avg_pred_test.cc | 276 + media/libvpx/libvpx/test/config_test.cc | 14 +- media/libvpx/libvpx/test/consistency_test.cc | 39 +- media/libvpx/libvpx/test/convolve_test.cc | 1423 +- media/libvpx/libvpx/test/cpu_speed_test.cc | 25 +- media/libvpx/libvpx/test/cq_test.cc | 34 +- media/libvpx/libvpx/test/cx_set_ref.sh | 2 +- media/libvpx/libvpx/test/datarate_test.cc | 1476 -- media/libvpx/libvpx/test/dct16x16_test.cc | 380 +- media/libvpx/libvpx/test/dct32x32_test.cc | 348 +- media/libvpx/libvpx/test/dct_partial_test.cc | 184 + media/libvpx/libvpx/test/dct_test.cc | 791 + media/libvpx/libvpx/test/decode_api_test.cc | 140 +- media/libvpx/libvpx/test/decode_corrupted.cc | 104 + media/libvpx/libvpx/test/decode_perf_test.cc | 50 +- media/libvpx/libvpx/test/decode_svc_test.cc | 30 +- .../libvpx/libvpx/test/decode_test_driver.cc | 27 +- media/libvpx/libvpx/test/decode_test_driver.h | 10 +- media/libvpx/libvpx/test/decode_to_md5.sh | 2 +- media/libvpx/libvpx/test/decode_with_drops.sh | 10 +- media/libvpx/libvpx/test/encode_api_test.cc | 2266 +- media/libvpx/libvpx/test/encode_perf_test.cc | 28 +- .../libvpx/libvpx/test/encode_test_driver.cc | 35 +- media/libvpx/libvpx/test/encode_test_driver.h | 85 +- .../libvpx/test/error_resilience_test.cc | 50 +- .../libvpx/test/external_frame_buffer_test.cc | 123 +- media/libvpx/libvpx/test/fdct4x4_test.cc | 511 - media/libvpx/libvpx/test/fdct8x8_test.cc | 217 +- media/libvpx/libvpx/test/frame_size_tests.cc | 165 +- media/libvpx/libvpx/test/hadamard_test.cc | 401 +- media/libvpx/libvpx/test/i420_video_source.h | 6 +- media/libvpx/libvpx/test/idct8x8_test.cc | 3 +- media/libvpx/libvpx/test/idct_test.cc | 208 +- media/libvpx/libvpx/test/init_vpx_test.cc | 99 + media/libvpx/libvpx/test/init_vpx_test.h | 18 + media/libvpx/libvpx/test/invalid_file_test.cc | 68 +- media/libvpx/libvpx/test/ivf_video_source.h | 38 +- media/libvpx/libvpx/test/keyframe_test.cc | 134 +- media/libvpx/libvpx/test/level_test.cc | 53 +- media/libvpx/libvpx/test/lpf_test.cc | 171 +- media/libvpx/libvpx/test/md5_helper.h | 8 +- media/libvpx/libvpx/test/minmax_test.cc | 145 +- .../libvpx/libvpx/test/non_greedy_mv_test.cc | 200 + media/libvpx/libvpx/test/partial_idct_test.cc | 593 +- media/libvpx/libvpx/test/postproc.sh | 2 +- media/libvpx/libvpx/test/pp_filter_test.cc | 664 +- media/libvpx/libvpx/test/predict_test.cc | 110 +- media/libvpx/libvpx/test/quantize_test.cc | 71 +- media/libvpx/libvpx/test/realtime_test.cc | 89 +- .../libvpx/libvpx/test/register_state_check.h | 56 +- media/libvpx/libvpx/test/resize_test.cc | 401 +- media/libvpx/libvpx/test/resize_util.sh | 69 - media/libvpx/libvpx/test/sad_test.cc | 1297 +- media/libvpx/libvpx/test/set_maps.sh | 2 +- media/libvpx/libvpx/test/set_roi.cc | 14 +- media/libvpx/libvpx/test/simple_decoder.sh | 2 +- media/libvpx/libvpx/test/simple_encoder.sh | 2 +- media/libvpx/libvpx/test/stress.sh | 80 +- media/libvpx/libvpx/test/sum_squares_test.cc | 251 +- media/libvpx/libvpx/test/superframe_test.cc | 28 +- media/libvpx/libvpx/test/svc_datarate_test.cc | 2208 ++ .../libvpx/libvpx/test/svc_end_to_end_test.cc | 825 + media/libvpx/libvpx/test/svc_test.cc | 874 +- media/libvpx/libvpx/test/svc_test.h | 67 + media/libvpx/libvpx/test/test-data.mk | 42 +- media/libvpx/libvpx/test/test-data.sha1 | 39 +- media/libvpx/libvpx/test/test.mk | 70 +- .../libvpx/test/test_intra_pred_speed.cc | 296 +- media/libvpx/libvpx/test/test_libvpx.cc | 61 +- media/libvpx/libvpx/test/test_rc_interface.cc | 16 + media/libvpx/libvpx/test/test_vector_test.cc | 105 +- media/libvpx/libvpx/test/test_vectors.cc | 2 + media/libvpx/libvpx/test/test_vectors.h | 6 +- .../libvpx/test/tile_independence_test.cc | 18 +- media/libvpx/libvpx/test/timestamp_test.cc | 109 + media/libvpx/libvpx/test/tools_common.sh | 17 +- media/libvpx/libvpx/test/twopass_encoder.sh | 11 +- media/libvpx/libvpx/test/user_priv_test.cc | 14 +- media/libvpx/libvpx/test/util.h | 12 +- media/libvpx/libvpx/test/variance_test.cc | 1891 +- media/libvpx/libvpx/test/video_source.h | 80 +- .../libvpx/libvpx/test/vp8_boolcoder_test.cc | 5 +- media/libvpx/libvpx/test/vp8_datarate_test.cc | 508 + media/libvpx/libvpx/test/vp8_decrypt_test.cc | 2 +- .../libvpx/test/vp8_denoiser_sse2_test.cc | 20 +- media/libvpx/libvpx/test/vp8_fdct4x4_test.cc | 69 +- .../libvpx/libvpx/test/vp8_fragments_test.cc | 14 +- .../test/vp8_multi_resolution_encoder.sh | 24 +- .../libvpx/test/vp8_ratectrl_rtc_test.cc | 424 + media/libvpx/libvpx/test/vp9_arf_freq_test.cc | 33 +- ..._block_test.cc => vp9_block_error_test.cc} | 122 +- .../libvpx/libvpx/test/vp9_boolcoder_test.cc | 32 +- .../libvpx/test/vp9_c_vs_simd_encode.sh | 420 + media/libvpx/libvpx/test/vp9_datarate_test.cc | 1151 + media/libvpx/libvpx/test/vp9_decrypt_test.cc | 2 +- ...iser_sse2_test.cc => vp9_denoiser_test.cc} | 65 +- .../test/vp9_encoder_parms_get_to_decoder.cc | 34 +- .../libvpx/libvpx/test/vp9_end_to_end_test.cc | 263 +- media/libvpx/libvpx/test/vp9_ethread_test.cc | 355 +- .../libvpx/test/vp9_ext_ratectrl_test.cc | 271 + .../libvpx/test/vp9_frame_parallel_test.cc | 217 - .../libvpx/libvpx/test/vp9_intrapred_test.cc | 521 +- media/libvpx/libvpx/test/vp9_lossless_test.cc | 29 +- .../libvpx/test/vp9_motion_vector_test.cc | 102 + media/libvpx/libvpx/test/vp9_quantize_test.cc | 930 +- .../libvpx/test/vp9_ratectrl_rtc_test.cc | 675 + media/libvpx/libvpx/test/vp9_roi_test.cc | 148 + media/libvpx/libvpx/test/vp9_scale_test.cc | 214 + .../libvpx/test/vp9_skip_loopfilter_test.cc | 37 +- .../libvpx/test/vp9_spatial_svc_encoder.sh | 72 - media/libvpx/libvpx/test/vp9_subtract_test.cc | 333 +- media/libvpx/libvpx/test/vp9_thread_test.cc | 195 +- media/libvpx/libvpx/test/vpx_image_test.cc | 128 + media/libvpx/libvpx/test/vpx_scale_test.cc | 197 +- media/libvpx/libvpx/test/vpx_scale_test.h | 201 + .../libvpx/test/vpx_temporal_svc_encoder.sh | 146 +- media/libvpx/libvpx/test/vpxdec.sh | 39 +- media/libvpx/libvpx/test/vpxenc.sh | 196 +- media/libvpx/libvpx/test/webm_video_source.h | 34 +- media/libvpx/libvpx/test/y4m_test.cc | 105 +- media/libvpx/libvpx/test/y4m_video_source.h | 45 +- .../libvpx/test/yuv_temporal_filter_test.cc | 727 + media/libvpx/libvpx/test/yuv_video_source.h | 41 +- .../third_party/googletest/README.libvpx | 28 +- .../third_party/googletest/src/.clang-format | 4 + .../libvpx/third_party/googletest/src/CHANGES | 157 - .../third_party/googletest/src/CONTRIBUTORS | 28 + .../libvpx/third_party/googletest/src/README | 435 - .../third_party/googletest/src/README.md | 217 + .../include/gtest/gtest-assertion-result.h | 237 + .../src/include/gtest/gtest-death-test.h | 345 + .../src/include/gtest/gtest-matchers.h | 956 + .../src/include/gtest/gtest-message.h | 218 + .../src/include/gtest/gtest-param-test.h | 510 + .../src/include/gtest/gtest-printers.h | 1048 + .../googletest/src/include/gtest/gtest-spi.h | 248 + .../src/include/gtest/gtest-test-part.h | 190 + .../src/include/gtest/gtest-typed-test.h | 331 + .../googletest/src/include/gtest/gtest.h | 19368 +--------------- .../src/include/gtest/gtest_pred_impl.h | 279 + .../googletest/src/include/gtest/gtest_prod.h | 60 + .../include/gtest/internal/custom/README.md | 44 + .../gtest/internal/custom/gtest-port.h | 68 + .../gtest/internal/custom/gtest-printers.h | 42 + .../src/include/gtest/internal/custom/gtest.h | 37 + .../internal/gtest-death-test-internal.h | 306 + .../include/gtest/internal/gtest-filepath.h | 210 + .../include/gtest/internal/gtest-internal.h | 1570 ++ .../include/gtest/internal/gtest-param-util.h | 956 + .../include/gtest/internal/gtest-port-arch.h | 116 + .../src/include/gtest/internal/gtest-port.h | 2413 ++ .../src/include/gtest/internal/gtest-string.h | 177 + .../include/gtest/internal/gtest-type-util.h | 186 + .../googletest/src/src/gtest-all.cc | 9565 +------- .../src/src/gtest-assertion-result.cc | 77 + .../googletest/src/src/gtest-death-test.cc | 1620 ++ .../googletest/src/src/gtest-filepath.cc | 367 + .../googletest/src/src/gtest-internal-inl.h | 1212 + .../googletest/src/src/gtest-matchers.cc | 98 + .../googletest/src/src/gtest-port.cc | 1394 ++ .../googletest/src/src/gtest-printers.cc | 553 + .../googletest/src/src/gtest-test-part.cc | 105 + .../googletest/src/src/gtest-typed-test.cc | 104 + .../third_party/googletest/src/src/gtest.cc | 6795 ++++++ .../googletest/src/src/gtest_main.cc | 19 +- .../libvpx/third_party/libwebm/AUTHORS.TXT | 1 + .../libvpx/third_party/libwebm/Android.mk | 8 +- .../libvpx/third_party/libwebm/README.libvpx | 16 +- .../third_party/libwebm/common/file_util.cc | 19 +- .../third_party/libwebm/common/file_util.h | 5 +- .../third_party/libwebm/common/hdr_util.cc | 11 +- .../third_party/libwebm/common/hdr_util.h | 2 +- .../third_party/libwebm/common/webmids.h | 1 + .../third_party/libwebm/mkvmuxer/mkvmuxer.cc | 234 +- .../third_party/libwebm/mkvmuxer/mkvmuxer.h | 15 +- .../libwebm/mkvmuxer/mkvmuxerutil.cc | 32 +- .../libwebm/mkvmuxer/mkvmuxerutil.h | 7 +- .../third_party/libwebm/mkvmuxer/mkvwriter.cc | 12 +- .../libwebm/mkvparser/mkvparser.cc | 225 +- .../third_party/libwebm/mkvparser/mkvparser.h | 6 +- .../libwebm/mkvparser/mkvreader.cc | 12 +- .../libvpx/libvpx/third_party/libyuv/LICENSE | 29 + .../libvpx/third_party/libyuv/README.libvpx | 23 +- .../libyuv/include/libyuv/basic_types.h | 109 +- .../libyuv/include/libyuv/compare.h | 93 +- .../libyuv/include/libyuv/convert.h | 421 +- .../libyuv/include/libyuv/convert_argb.h | 676 +- .../libyuv/include/libyuv/convert_from.h | 377 +- .../libyuv/include/libyuv/convert_from_argb.h | 283 +- .../libyuv/include/libyuv/cpu_id.h | 75 +- .../libyuv/include/libyuv/macros_msa.h | 233 + .../libyuv/include/libyuv/mjpeg_decoder.h | 33 +- .../libyuv/include/libyuv/planar_functions.h | 1248 +- .../libyuv/include/libyuv/rotate.h | 143 +- .../libyuv/include/libyuv/rotate_argb.h | 14 +- .../libyuv/include/libyuv/rotate_row.h | 203 +- .../third_party/libyuv/include/libyuv/row.h | 4065 +++- .../third_party/libyuv/include/libyuv/scale.h | 110 +- .../libyuv/include/libyuv/scale_argb.h | 60 +- .../libyuv/include/libyuv/scale_row.h | 1083 +- .../libyuv/include/libyuv/version.h | 6 +- .../libyuv/include/libyuv/video_common.h | 52 +- .../third_party/libyuv/source/compare.cc | 267 +- .../libyuv/source/compare_common.cc | 70 +- .../third_party/libyuv/source/compare_gcc.cc | 427 +- .../third_party/libyuv/source/compare_msa.cc | 97 + .../third_party/libyuv/source/compare_neon.cc | 94 +- .../libyuv/source/compare_neon64.cc | 88 +- .../third_party/libyuv/source/compare_win.cc | 119 +- .../third_party/libyuv/source/convert.cc | 963 +- .../third_party/libyuv/source/convert_argb.cc | 1777 +- .../third_party/libyuv/source/convert_from.cc | 1165 +- .../libyuv/source/convert_from_argb.cc | 839 +- .../third_party/libyuv/source/convert_jpeg.cc | 243 +- .../libyuv/source/convert_to_argb.cc | 246 +- .../libyuv/source/convert_to_i420.cc | 302 +- .../third_party/libyuv/source/cpu_id.cc | 208 +- .../libyuv/source/mjpeg_decoder.cc | 126 +- .../libyuv/source/mjpeg_validate.cc | 11 +- .../libyuv/source/planar_functions.cc | 1876 +- .../third_party/libyuv/source/rotate.cc | 377 +- .../third_party/libyuv/source/rotate_any.cc | 57 +- .../third_party/libyuv/source/rotate_argb.cc | 163 +- .../libyuv/source/rotate_common.cc | 40 +- .../third_party/libyuv/source/rotate_gcc.cc | 660 +- .../third_party/libyuv/source/rotate_mips.cc | 484 - .../third_party/libyuv/source/rotate_msa.cc | 250 + .../third_party/libyuv/source/rotate_neon.cc | 567 +- .../libyuv/source/rotate_neon64.cc | 685 +- .../third_party/libyuv/source/rotate_win.cc | 51 +- .../third_party/libyuv/source/row_any.cc | 937 +- .../third_party/libyuv/source/row_common.cc | 2514 +- .../third_party/libyuv/source/row_gcc.cc | 9987 ++++---- .../third_party/libyuv/source/row_mips.cc | 782 - .../third_party/libyuv/source/row_msa.cc | 3512 +++ .../third_party/libyuv/source/row_neon.cc | 4374 ++-- .../third_party/libyuv/source/row_neon64.cc | 4147 ++-- .../third_party/libyuv/source/row_win.cc | 3943 ++-- .../libvpx/third_party/libyuv/source/scale.cc | 987 +- .../third_party/libyuv/source/scale_any.cc | 489 +- .../third_party/libyuv/source/scale_argb.cc | 573 +- .../third_party/libyuv/source/scale_common.cc | 808 +- .../third_party/libyuv/source/scale_gcc.cc | 2280 +- .../third_party/libyuv/source/scale_mips.cc | 644 - .../third_party/libyuv/source/scale_msa.cc | 949 + .../third_party/libyuv/source/scale_neon.cc | 1453 +- .../third_party/libyuv/source/scale_neon64.cc | 1582 +- .../third_party/libyuv/source/scale_win.cc | 861 +- .../third_party/libyuv/source/video_common.cc | 51 +- .../libvpx/libvpx/third_party/nalloc/LICENSE | 21 + .../libvpx/third_party/nalloc/README.libvpx | 11 + .../libvpx/libvpx/third_party/nalloc/nalloc.h | 342 + .../libvpx/third_party/x86inc/README.libvpx | 9 +- .../libvpx/third_party/x86inc/x86inc.asm | 750 +- media/libvpx/libvpx/tools.mk | 25 +- .../3D-Reconstruction/MotionEST/Anandan.py | 193 + .../3D-Reconstruction/MotionEST/Exhaust.py | 259 + .../MotionEST/GroundTruth.py | 48 + .../MotionEST/HornSchunck.py | 212 + .../3D-Reconstruction/MotionEST/MotionEST.py | 117 + .../MotionEST/SearchSmooth.py | 221 + .../tools/3D-Reconstruction/MotionEST/Util.py | 46 + .../tools/3D-Reconstruction/genY4M/genY4M.py | 85 + .../sketch_3D_reconstruction/BVH.pde | 163 + .../sketch_3D_reconstruction/Camera.pde | 138 + .../sketch_3D_reconstruction/MotionField.pde | 102 + .../sketch_3D_reconstruction/PointCloud.pde | 138 + .../sketch_3D_reconstruction/Ray_Tracing.pde | 61 + .../sketch_3D_reconstruction/Scene.pde | 59 + .../sketch_3D_reconstruction/Transform.pde | 82 + .../sketch_3D_reconstruction/Util.pde | 28 + .../sketch_3D_reconstruction.pde | 74 + media/libvpx/libvpx/tools/README.pgo.md | 24 + media/libvpx/libvpx/tools/all_builds.py | 72 - .../libvpx/tools/author_first_release.sh | 15 - media/libvpx/libvpx/tools/cpplint.py | 3428 ++- media/libvpx/libvpx/tools/diff.py | 2 +- media/libvpx/libvpx/tools/ftfy.sh | 158 - media/libvpx/libvpx/tools/intersect-diffs.py | 4 +- media/libvpx/libvpx/tools/lint-hunks.py | 38 +- .../tools/non_greedy_mv/non_greedy_mv.py | 195 + media/libvpx/libvpx/tools/set_analyzer_env.sh | 135 + media/libvpx/libvpx/tools/tiny_ssim.c | 623 +- media/libvpx/libvpx/tools/wrap-commit-msg.py | 2 +- media/libvpx/libvpx/tools_common.c | 369 +- media/libvpx/libvpx/tools_common.h | 81 +- media/libvpx/libvpx/usage_cx.dox | 2 + media/libvpx/libvpx/usage_dx.dox | 2 + media/libvpx/libvpx/video_common.h | 6 +- media/libvpx/libvpx/video_reader.c | 32 +- media/libvpx/libvpx/video_reader.h | 6 +- media/libvpx/libvpx/video_writer.c | 14 +- media/libvpx/libvpx/video_writer.h | 6 +- media/libvpx/libvpx/vp8/common/alloccommon.h | 8 +- .../libvpx/vp8/common/arm/loopfilter_arm.c | 22 +- .../libvpx/vp8/common/arm/loopfilter_arm.h | 31 + .../common/arm/neon/bilinearpredict_neon.c | 36 +- .../libvpx/vp8/common/arm/neon/copymem_neon.c | 2 + .../vp8/common/arm/neon/dequantizeb_neon.c | 1 + .../vp8/common/arm/neon/idct_blk_neon.c | 251 +- .../common/arm/neon/idct_dequant_0_2x_neon.c | 59 - .../arm/neon/idct_dequant_full_2x_neon.c | 182 - .../libvpx/vp8/common/arm/neon/iwalsh_neon.c | 2 + .../loopfiltersimplehorizontaledge_neon.c | 2 + .../neon/loopfiltersimpleverticaledge_neon.c | 2 + .../vp8/common/arm/neon/mbloopfilter_neon.c | 2 + .../vp8/common/arm/neon/sixtappredict_neon.c | 46 +- .../vp8/common/arm/neon/vp8_loopfilter_neon.c | 2 + media/libvpx/libvpx/vp8/common/blockd.c | 12 +- media/libvpx/libvpx/vp8/common/blockd.h | 32 +- .../libvpx/vp8/common/coefupdateprobs.h | 6 +- media/libvpx/libvpx/vp8/common/common.h | 24 +- .../libvpx/vp8/common/default_coef_probs.h | 8 +- media/libvpx/libvpx/vp8/common/entropy.c | 20 +- media/libvpx/libvpx/vp8/common/entropy.h | 6 +- media/libvpx/libvpx/vp8/common/entropymode.c | 23 +- media/libvpx/libvpx/vp8/common/entropymode.h | 10 +- media/libvpx/libvpx/vp8/common/entropymv.h | 6 +- media/libvpx/libvpx/vp8/common/extend.c | 54 +- media/libvpx/libvpx/vp8/common/extend.h | 6 +- media/libvpx/libvpx/vp8/common/filter.h | 6 +- media/libvpx/libvpx/vp8/common/findnearmv.c | 32 +- media/libvpx/libvpx/vp8/common/findnearmv.h | 8 +- .../vp8/common/generic/systemdependent.c | 57 +- media/libvpx/libvpx/vp8/common/header.h | 6 +- media/libvpx/libvpx/vp8/common/idct_blk.c | 26 +- media/libvpx/libvpx/vp8/common/invtrans.h | 6 +- .../libvpx/vp8/common/loongarch/idct_lsx.c | 322 + .../common/loongarch/loopfilter_filters_lsx.c | 743 + .../vp8/common/loongarch/sixtap_filter_lsx.c | 1904 ++ media/libvpx/libvpx/vp8/common/loopfilter.h | 12 +- .../libvpx/vp8/common/loopfilter_filters.c | 124 +- media/libvpx/libvpx/vp8/common/mfqe.c | 19 +- .../vp8/common/mips/dspr2/filter_dspr2.c | 26 +- .../vp8/common/mips/dspr2/idct_blk_dspr2.c | 20 +- .../mips/dspr2/vp8_loopfilter_filters_dspr2.c | 12 +- .../libvpx/vp8/common/mips/mmi/copymem_mmi.c | 114 + .../vp8/common/mips/mmi/dequantize_mmi.c | 115 + .../libvpx/vp8/common/mips/mmi/idct_blk_mmi.c | 70 + .../libvpx/vp8/common/mips/mmi/idctllm_mmi.c | 335 + .../common/mips/mmi/loopfilter_filters_mmi.c | 1415 ++ .../vp8/common/mips/mmi/sixtap_filter_mmi.c | 427 + .../libvpx/vp8/common/mips/msa/idct_msa.c | 58 +- .../vp8/common/mips/msa/sixtap_filter_msa.c | 181 +- .../vp8/common/mips/msa/vp8_macros_msa.h | 262 +- media/libvpx/libvpx/vp8/common/modecont.c | 36 +- media/libvpx/libvpx/vp8/common/modecont.h | 6 +- media/libvpx/libvpx/vp8/common/mv.h | 6 +- media/libvpx/libvpx/vp8/common/onyx.h | 65 +- media/libvpx/libvpx/vp8/common/onyxc_int.h | 7 +- media/libvpx/libvpx/vp8/common/onyxd.h | 19 +- media/libvpx/libvpx/vp8/common/postproc.c | 140 +- media/libvpx/libvpx/vp8/common/postproc.h | 15 +- media/libvpx/libvpx/vp8/common/ppflags.h | 6 +- media/libvpx/libvpx/vp8/common/quant_common.h | 6 +- media/libvpx/libvpx/vp8/common/reconinter.c | 7 + media/libvpx/libvpx/vp8/common/reconinter.h | 29 +- media/libvpx/libvpx/vp8/common/reconintra.c | 8 + media/libvpx/libvpx/vp8/common/reconintra.h | 6 +- .../libvpx/libvpx/vp8/common/reconintra4x4.c | 16 +- .../libvpx/libvpx/vp8/common/reconintra4x4.h | 8 +- media/libvpx/libvpx/vp8/common/rtcd.c | 2 +- media/libvpx/libvpx/vp8/common/rtcd_defs.pl | 148 +- .../libvpx/vp8/common/setupintrarecon.h | 6 +- .../libvpx/libvpx/vp8/common/swapyv12buffer.h | 6 +- .../libvpx/vp8/common/systemdependent.h | 6 +- media/libvpx/libvpx/vp8/common/threading.h | 217 +- media/libvpx/libvpx/vp8/common/treecoder.c | 9 +- media/libvpx/libvpx/vp8/common/treecoder.h | 8 +- .../libvpx/vp8/common/vp8_entropymodedata.h | 8 +- .../libvpx/libvpx/vp8/common/vp8_loopfilter.c | 17 +- .../libvpx/vp8/common/vp8_skin_detection.c | 109 + .../libvpx/vp8/common/vp8_skin_detection.h | 47 + .../vp8/common/x86/bilinear_filter_sse2.c | 336 + .../libvpx/vp8/common/x86/dequantize_mmx.asm | 5 +- .../libvpx/libvpx/vp8/common/x86/filter_x86.c | 29 - .../libvpx/libvpx/vp8/common/x86/filter_x86.h | 33 - .../libvpx/vp8/common/x86/idct_blk_sse2.c | 24 +- .../libvpx/vp8/common/x86/idctllm_mmx.asm | 5 +- .../libvpx/vp8/common/x86/idctllm_sse2.asm | 10 +- .../libvpx/vp8/common/x86/iwalsh_sse2.asm | 6 +- .../x86/loopfilter_block_sse2_x86_64.asm | 6 +- .../libvpx/vp8/common/x86/loopfilter_sse2.asm | 22 +- .../libvpx/vp8/common/x86/loopfilter_x86.c | 6 +- .../libvpx/vp8/common/x86/mfqe_sse2.asm | 8 +- .../libvpx/vp8/common/x86/recon_mmx.asm | 5 +- .../libvpx/vp8/common/x86/recon_sse2.asm | 4 +- .../libvpx/vp8/common/x86/subpixel_mmx.asm | 281 +- .../libvpx/vp8/common/x86/subpixel_sse2.asm | 431 +- .../libvpx/vp8/common/x86/subpixel_ssse3.asm | 17 +- .../libvpx/vp8/common/x86/vp8_asm_stubs.c | 13 +- media/libvpx/libvpx/vp8/decoder/dboolhuff.c | 10 +- media/libvpx/libvpx/vp8/decoder/dboolhuff.h | 12 +- media/libvpx/libvpx/vp8/decoder/decodeframe.c | 69 +- media/libvpx/libvpx/vp8/decoder/decodemv.c | 21 +- media/libvpx/libvpx/vp8/decoder/decodemv.h | 6 +- .../libvpx/vp8/decoder/decoderthreading.h | 8 +- media/libvpx/libvpx/vp8/decoder/detokenize.c | 6 +- media/libvpx/libvpx/vp8/decoder/detokenize.h | 6 +- media/libvpx/libvpx/vp8/decoder/ec_types.h | 10 +- .../libvpx/vp8/decoder/error_concealment.c | 10 +- .../libvpx/vp8/decoder/error_concealment.h | 6 +- media/libvpx/libvpx/vp8/decoder/onyxd_if.c | 43 +- media/libvpx/libvpx/vp8/decoder/onyxd_int.h | 54 +- media/libvpx/libvpx/vp8/decoder/threading.c | 198 +- media/libvpx/libvpx/vp8/decoder/treereader.h | 8 +- .../vp8/encoder/arm/neon/fastquantizeb_neon.c | 12 +- .../vp8/encoder/arm/neon/shortfdct_neon.c | 2 + .../encoder/arm/neon/vp8_shortwalsh4x4_neon.c | 2 + media/libvpx/libvpx/vp8/encoder/bitstream.c | 190 +- media/libvpx/libvpx/vp8/encoder/bitstream.h | 14 +- media/libvpx/libvpx/vp8/encoder/block.h | 6 +- media/libvpx/libvpx/vp8/encoder/boolhuff.c | 26 +- media/libvpx/libvpx/vp8/encoder/boolhuff.h | 70 +- .../libvpx/vp8/{common => encoder}/copy_c.c | 0 .../libvpx/vp8/encoder/dct_value_cost.h | 6 +- .../libvpx/vp8/encoder/dct_value_tokens.h | 6 +- .../libvpx/vp8/encoder/defaultcoefcounts.h | 6 +- media/libvpx/libvpx/vp8/encoder/denoising.c | 49 +- media/libvpx/libvpx/vp8/encoder/denoising.h | 6 +- media/libvpx/libvpx/vp8/encoder/encodeframe.c | 151 +- media/libvpx/libvpx/vp8/encoder/encodeframe.h | 31 +- media/libvpx/libvpx/vp8/encoder/encodeintra.c | 3 +- media/libvpx/libvpx/vp8/encoder/encodeintra.h | 8 +- media/libvpx/libvpx/vp8/encoder/encodemb.c | 12 +- media/libvpx/libvpx/vp8/encoder/encodemb.h | 6 +- media/libvpx/libvpx/vp8/encoder/encodemv.c | 36 +- media/libvpx/libvpx/vp8/encoder/encodemv.h | 6 +- media/libvpx/libvpx/vp8/encoder/ethreading.c | 154 +- media/libvpx/libvpx/vp8/encoder/ethreading.h | 32 + media/libvpx/libvpx/vp8/encoder/firstpass.c | 274 +- media/libvpx/libvpx/vp8/encoder/firstpass.h | 6 +- media/libvpx/libvpx/vp8/encoder/lookahead.c | 4 +- media/libvpx/libvpx/vp8/encoder/lookahead.h | 8 +- .../libvpx/vp8/encoder/loongarch/dct_lsx.c | 161 + .../vp8/encoder/loongarch/encodeopt_lsx.c | 82 + .../vp8/encoder/loongarch/vp8_quantize_lsx.c | 145 + media/libvpx/libvpx/vp8/encoder/mcomp.c | 505 +- media/libvpx/libvpx/vp8/encoder/mcomp.h | 42 +- .../libvpx/vp8/encoder/mips/mmi/dct_mmi.c | 434 + .../vp8/encoder/mips/mmi/vp8_quantize_mmi.c | 263 + media/libvpx/libvpx/vp8/encoder/modecosts.h | 8 +- media/libvpx/libvpx/vp8/encoder/mr_dissim.c | 2 +- media/libvpx/libvpx/vp8/encoder/mr_dissim.h | 6 +- media/libvpx/libvpx/vp8/encoder/onyx_if.c | 991 +- media/libvpx/libvpx/vp8/encoder/onyx_int.h | 96 +- media/libvpx/libvpx/vp8/encoder/pickinter.c | 172 +- media/libvpx/libvpx/vp8/encoder/pickinter.h | 6 +- media/libvpx/libvpx/vp8/encoder/picklpf.c | 11 +- media/libvpx/libvpx/vp8/encoder/picklpf.h | 30 + media/libvpx/libvpx/vp8/encoder/quantize.h | 6 +- media/libvpx/libvpx/vp8/encoder/ratectrl.c | 260 +- media/libvpx/libvpx/vp8/encoder/ratectrl.h | 6 +- media/libvpx/libvpx/vp8/encoder/rdopt.c | 157 +- media/libvpx/libvpx/vp8/encoder/rdopt.h | 27 +- .../libvpx/libvpx/vp8/encoder/segmentation.c | 4 +- .../libvpx/libvpx/vp8/encoder/segmentation.h | 10 +- .../libvpx/vp8/encoder/temporal_filter.c | 4 +- .../libvpx/vp8/encoder/temporal_filter.h | 26 + media/libvpx/libvpx/vp8/encoder/tokenize.c | 70 - media/libvpx/libvpx/vp8/encoder/tokenize.h | 16 +- media/libvpx/libvpx/vp8/encoder/treewriter.h | 22 +- .../libvpx/libvpx/vp8/encoder/vp8_quantize.c | 13 +- .../{encodeopt.asm => block_error_sse2.asm} | 8 +- .../vp8/{common => encoder}/x86/copy_sse2.asm | 3 +- .../vp8/{common => encoder}/x86/copy_sse3.asm | 3 +- .../libvpx/vp8/encoder/x86/dct_sse2.asm | 6 +- .../libvpx/vp8/encoder/x86/denoising_sse2.c | 2 +- .../libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 4 +- .../libvpx/vp8/encoder/x86/quantize_mmx.asm | 286 - .../libvpx/vp8/encoder/x86/quantize_sse4.c | 121 +- .../x86/temporal_filter_apply_sse2.asm | 6 +- .../vp8/encoder/x86/vp8_enc_stubs_mmx.c | 34 - ...{quantize_ssse3.c => vp8_quantize_ssse3.c} | 40 +- media/libvpx/libvpx/vp8/vp8_common.mk | 28 +- media/libvpx/libvpx/vp8/vp8_cx_iface.c | 368 +- media/libvpx/libvpx/vp8/vp8_dx_iface.c | 174 +- media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 440 + media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 66 + media/libvpx/libvpx/vp8/vp8cx.mk | 23 +- .../arm/neon/vp9_highbd_iht16x16_add_neon.c | 446 + .../arm/neon/vp9_highbd_iht4x4_add_neon.c | 181 + .../arm/neon/vp9_highbd_iht8x8_add_neon.c | 345 + .../common/arm/neon/vp9_iht16x16_add_neon.c | 279 + .../vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 238 +- .../vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 542 +- .../libvpx/vp9/common/arm/neon/vp9_iht_neon.h | 272 + .../vp9/common/mips/msa/vp9_idct16x16_msa.c | 1 + .../vp9/common/mips/msa/vp9_idct4x4_msa.c | 1 + .../vp9/common/mips/msa/vp9_idct8x8_msa.c | 1 + .../libvpx/vp9/common/ppc/vp9_idct_vsx.c | 116 + .../libvpx/vp9/common/vp9_alloccommon.c | 74 +- .../libvpx/vp9/common/vp9_alloccommon.h | 13 +- media/libvpx/libvpx/vp9/common/vp9_blockd.c | 5 +- media/libvpx/libvpx/vp9/common/vp9_blockd.h | 46 +- media/libvpx/libvpx/vp9/common/vp9_common.h | 39 +- .../libvpx/vp9/common/vp9_common_data.c | 2 +- .../libvpx/vp9/common/vp9_common_data.h | 6 +- .../libvpx/libvpx/vp9/common/vp9_debugmodes.c | 2 +- media/libvpx/libvpx/vp9/common/vp9_entropy.c | 2 + media/libvpx/libvpx/vp9/common/vp9_entropy.h | 7 +- .../libvpx/vp9/common/vp9_entropymode.c | 24 +- .../libvpx/vp9/common/vp9_entropymode.h | 8 +- .../libvpx/libvpx/vp9/common/vp9_entropymv.c | 4 +- .../libvpx/libvpx/vp9/common/vp9_entropymv.h | 10 +- media/libvpx/libvpx/vp9/common/vp9_enums.h | 8 +- media/libvpx/libvpx/vp9/common/vp9_filter.c | 18 +- media/libvpx/libvpx/vp9/common/vp9_filter.h | 9 +- .../libvpx/vp9/common/vp9_frame_buffers.c | 14 +- .../libvpx/vp9/common/vp9_frame_buffers.h | 6 +- media/libvpx/libvpx/vp9/common/vp9_idct.c | 33 +- media/libvpx/libvpx/vp9/common/vp9_idct.h | 22 +- .../libvpx/libvpx/vp9/common/vp9_loopfilter.c | 70 +- .../libvpx/libvpx/vp9/common/vp9_loopfilter.h | 14 +- media/libvpx/libvpx/vp9/common/vp9_mfqe.c | 2 +- media/libvpx/libvpx/vp9/common/vp9_mfqe.h | 6 +- media/libvpx/libvpx/vp9/common/vp9_mv.h | 8 +- .../libvpx/vp9/common/vp9_mvref_common.h | 10 +- .../libvpx/libvpx/vp9/common/vp9_onyxc_int.h | 125 +- media/libvpx/libvpx/vp9/common/vp9_postproc.c | 58 +- media/libvpx/libvpx/vp9/common/vp9_postproc.h | 17 +- media/libvpx/libvpx/vp9/common/vp9_ppflags.h | 6 +- .../libvpx/vp9/common/vp9_pred_common.c | 31 +- .../libvpx/vp9/common/vp9_pred_common.h | 16 +- .../libvpx/vp9/common/vp9_quant_common.h | 6 +- .../libvpx/libvpx/vp9/common/vp9_reconinter.c | 47 +- .../libvpx/libvpx/vp9/common/vp9_reconinter.h | 33 +- .../libvpx/libvpx/vp9/common/vp9_reconintra.h | 6 +- media/libvpx/libvpx/vp9/common/vp9_rtcd.c | 6 +- .../libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl | 221 +- media/libvpx/libvpx/vp9/common/vp9_scale.h | 10 +- media/libvpx/libvpx/vp9/common/vp9_scan.c | 297 +- media/libvpx/libvpx/vp9/common/vp9_scan.h | 18 +- .../libvpx/libvpx/vp9/common/vp9_seg_common.h | 11 +- .../libvpx/vp9/common/vp9_thread_common.c | 282 +- .../libvpx/vp9/common/vp9_thread_common.h | 33 +- .../libvpx/vp9/common/vp9_tile_common.h | 6 +- .../common/x86/vp9_highbd_iht16x16_add_sse4.c | 419 + .../common/x86/vp9_highbd_iht4x4_add_sse4.c | 131 + .../common/x86/vp9_highbd_iht8x8_add_sse4.c | 255 + .../vp9/common/x86/vp9_idct_intrin_sse2.c | 181 +- .../libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 8 +- .../libvpx/vp9/decoder/vp9_decodeframe.c | 1281 +- .../libvpx/vp9/decoder/vp9_decodeframe.h | 6 +- .../libvpx/libvpx/vp9/decoder/vp9_decodemv.c | 150 +- .../libvpx/libvpx/vp9/decoder/vp9_decodemv.h | 6 +- media/libvpx/libvpx/vp9/decoder/vp9_decoder.c | 258 +- media/libvpx/libvpx/vp9/decoder/vp9_decoder.h | 75 +- .../libvpx/vp9/decoder/vp9_detokenize.c | 64 +- .../libvpx/vp9/decoder/vp9_detokenize.h | 11 +- media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h | 6 +- media/libvpx/libvpx/vp9/decoder/vp9_dthread.c | 190 - media/libvpx/libvpx/vp9/decoder/vp9_dthread.h | 74 - .../libvpx/libvpx/vp9/decoder/vp9_job_queue.c | 125 + .../libvpx/libvpx/vp9/decoder/vp9_job_queue.h | 45 + .../vp9/encoder/arm/neon/vp9_dct_neon.c | 2172 +- .../vp9/encoder/arm/neon/vp9_denoiser_neon.c | 356 + .../arm/neon/vp9_diamond_search_sad_neon.c | 296 + .../vp9/encoder/arm/neon/vp9_error_neon.c | 99 +- .../vp9/encoder/arm/neon/vp9_error_sve.c | 78 + .../encoder/arm/neon/vp9_frame_scale_neon.c | 844 + .../encoder/arm/neon/vp9_highbd_error_neon.c | 49 + .../neon/vp9_highbd_temporal_filter_neon.c | 1076 + .../neon/vp9_highbd_temporal_filter_sve2.c | 285 + .../vp9/encoder/arm/neon/vp9_quantize_neon.c | 470 +- .../arm/neon/vp9_temporal_filter_neon.c | 1103 + .../neon/vp9_temporal_filter_neon_dotprod.c | 367 + .../arm/neon/vp9_temporal_filter_neon_i8mm.c | 351 + .../vp9/encoder/mips/msa/vp9_error_msa.c | 3 + .../vp9/encoder/mips/msa/vp9_fdct16x16_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct4x4_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct8x8_msa.c | 1 + .../vp9/encoder/mips/msa/vp9_fdct_msa.h | 6 +- .../mips/msa/vp9_temporal_filter_msa.c | 283 - .../libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c | 287 + .../libvpx/vp9/encoder/vp9_alt_ref_aq.c | 2 +- .../libvpx/vp9/encoder/vp9_alt_ref_aq.h | 8 +- media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h | 6 +- .../libvpx/vp9/encoder/vp9_aq_complexity.c | 2 +- .../libvpx/vp9/encoder/vp9_aq_complexity.h | 6 +- .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 384 +- .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 25 +- .../libvpx/vp9/encoder/vp9_aq_variance.c | 79 +- .../libvpx/vp9/encoder/vp9_aq_variance.h | 10 +- .../libvpx/libvpx/vp9/encoder/vp9_bitstream.c | 309 +- .../libvpx/libvpx/vp9/encoder/vp9_bitstream.h | 21 +- media/libvpx/libvpx/vp9/encoder/vp9_block.h | 88 +- .../libvpx/vp9/encoder/vp9_blockiness.c | 1 + .../libvpx/vp9/encoder/vp9_blockiness.h | 26 + .../libvpx/vp9/encoder/vp9_context_tree.c | 49 +- .../libvpx/vp9/encoder/vp9_context_tree.h | 18 +- media/libvpx/libvpx/vp9/encoder/vp9_cost.h | 11 +- media/libvpx/libvpx/vp9/encoder/vp9_dct.c | 108 - .../libvpx/libvpx/vp9/encoder/vp9_denoiser.c | 480 +- .../libvpx/libvpx/vp9/encoder/vp9_denoiser.h | 60 +- .../libvpx/vp9/encoder/vp9_encodeframe.c | 3379 ++- .../libvpx/vp9/encoder/vp9_encodeframe.h | 24 +- .../libvpx/libvpx/vp9/encoder/vp9_encodemb.c | 906 +- .../libvpx/libvpx/vp9/encoder/vp9_encodemb.h | 21 +- .../libvpx/libvpx/vp9/encoder/vp9_encodemv.h | 8 +- media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 4301 ++-- media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 746 +- media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 661 +- media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 52 +- .../libvpx/vp9/encoder/vp9_ext_ratectrl.c | 257 + .../libvpx/vp9/encoder/vp9_ext_ratectrl.h | 63 + media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 61 +- media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 9 +- .../libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 3824 +-- .../libvpx/libvpx/vp9/encoder/vp9_firstpass.h | 202 +- .../libvpx/vp9/encoder/vp9_firstpass_stats.h | 54 + .../libvpx/vp9/encoder/vp9_frame_scale.c | 136 + .../libvpx/libvpx/vp9/encoder/vp9_job_queue.h | 46 + .../libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 126 +- .../libvpx/libvpx/vp9/encoder/vp9_lookahead.h | 47 +- media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c | 53 +- media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h | 10 +- media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c | 1554 +- media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h | 110 +- .../libvpx/vp9/encoder/vp9_multi_thread.c | 342 + .../libvpx/vp9/encoder/vp9_multi_thread.h | 41 + .../libvpx/vp9/encoder/vp9_noise_estimate.c | 211 +- .../libvpx/vp9/encoder/vp9_noise_estimate.h | 9 +- .../libvpx/vp9/encoder/vp9_non_greedy_mv.c | 536 + .../libvpx/vp9/encoder/vp9_non_greedy_mv.h | 129 + .../libvpx/vp9/encoder/vp9_partition_models.h | 975 + media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c | 33 +- media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h | 6 +- .../libvpx/libvpx/vp9/encoder/vp9_pickmode.c | 1592 +- .../libvpx/libvpx/vp9/encoder/vp9_pickmode.h | 6 +- .../libvpx/libvpx/vp9/encoder/vp9_quantize.c | 241 +- .../libvpx/libvpx/vp9/encoder/vp9_quantize.h | 11 +- .../libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 1962 +- .../libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 90 +- media/libvpx/libvpx/vp9/encoder/vp9_rd.c | 373 +- media/libvpx/libvpx/vp9/encoder/vp9_rd.h | 64 +- media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 1457 +- media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h | 10 +- media/libvpx/libvpx/vp9/encoder/vp9_resize.c | 102 +- media/libvpx/libvpx/vp9/encoder/vp9_resize.h | 39 +- .../libvpx/vp9/encoder/vp9_segmentation.c | 56 +- .../libvpx/vp9/encoder/vp9_segmentation.h | 11 +- .../libvpx/vp9/encoder/vp9_skin_detection.c | 233 +- .../libvpx/vp9/encoder/vp9_skin_detection.h | 19 +- .../libvpx/vp9/encoder/vp9_speed_features.c | 742 +- .../libvpx/vp9/encoder/vp9_speed_features.h | 266 +- media/libvpx/libvpx/vp9/encoder/vp9_subexp.c | 43 +- media/libvpx/libvpx/vp9/encoder/vp9_subexp.h | 19 +- .../libvpx/vp9/encoder/vp9_svc_layercontext.c | 978 +- .../libvpx/vp9/encoder/vp9_svc_layercontext.h | 172 +- .../libvpx/vp9/encoder/vp9_temporal_filter.c | 1513 +- .../libvpx/vp9/encoder/vp9_temporal_filter.h | 54 +- .../encoder/vp9_temporal_filter_constants.h | 410 + .../libvpx/libvpx/vp9/encoder/vp9_tokenize.c | 8 +- .../libvpx/libvpx/vp9/encoder/vp9_tokenize.h | 25 +- .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 1791 ++ .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 47 + .../libvpx/vp9/encoder/vp9_treewriter.h | 6 +- .../encoder/x86/highbd_temporal_filter_avx2.c | 263 + .../encoder/x86/highbd_temporal_filter_sse4.c | 893 + .../x86/highbd_temporal_filter_ssse3.c | 233 + .../vp9/encoder/x86/temporal_filter_avx2.c | 441 + .../vp9/encoder/x86/temporal_filter_sse4.c | 875 + .../vp9/encoder/x86/temporal_filter_ssse3.c | 279 + .../vp9/encoder/x86/vp9_dct_intrin_sse2.c | 550 +- .../libvpx/vp9/encoder/x86/vp9_dct_sse2.asm | 23 +- .../libvpx/vp9/encoder/x86/vp9_dct_ssse3.c | 467 - .../vp9/encoder/x86/vp9_denoiser_sse2.c | 1 - .../encoder/x86/vp9_diamond_search_sad_avx.c | 310 - .../libvpx/vp9/encoder/x86/vp9_error_avx2.c | 161 + .../vp9/encoder/x86/vp9_error_intrin_avx2.c | 71 - .../libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 63 +- .../vp9/encoder/x86/vp9_frame_scale_ssse3.c | 1037 +- .../x86/vp9_highbd_block_error_intrin_sse2.c | 19 +- .../vp9/encoder/x86/vp9_highbd_error_avx.asm | 261 - .../vp9/encoder/x86/vp9_highbd_error_sse2.asm | 98 - .../vp9/encoder/x86/vp9_quantize_avx2.c | 439 + .../vp9/encoder/x86/vp9_quantize_sse2.c | 272 +- .../vp9/encoder/x86/vp9_quantize_ssse3.c | 252 + .../encoder/x86/vp9_quantize_ssse3_x86_64.asm | 201 - .../x86/vp9_temporal_filter_apply_sse2.asm | 212 - media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 354 + media/libvpx/libvpx/vp9/ratectrl_rtc.h | 106 + media/libvpx/libvpx/vp9/vp9_common.mk | 45 +- media/libvpx/libvpx/vp9/vp9_cx_iface.c | 1514 +- media/libvpx/libvpx/vp9/vp9_cx_iface.h | 49 + media/libvpx/libvpx/vp9/vp9_dx_iface.c | 679 +- media/libvpx/libvpx/vp9/vp9_dx_iface.h | 33 +- media/libvpx/libvpx/vp9/vp9_iface_common.c | 136 + media/libvpx/libvpx/vp9/vp9_iface_common.h | 143 +- media/libvpx/libvpx/vp9/vp9cx.mk | 88 +- media/libvpx/libvpx/vp9/vp9dx.mk | 4 +- media/libvpx/libvpx/vpx/exports_spatial_svc | 6 - .../libvpx/vpx/internal/vpx_codec_internal.h | 77 +- .../libvpx/vpx/internal/vpx_ratectrl_rtc.h | 80 + media/libvpx/libvpx/vpx/src/vpx_codec.c | 9 +- media/libvpx/libvpx/vpx/src/vpx_decoder.c | 22 +- media/libvpx/libvpx/vpx/src/vpx_encoder.c | 80 +- media/libvpx/libvpx/vpx/src/vpx_image.c | 131 +- media/libvpx/libvpx/vpx/vp8.h | 27 +- media/libvpx/libvpx/vpx/vp8cx.h | 498 +- media/libvpx/libvpx/vpx/vp8dx.h | 68 +- media/libvpx/libvpx/vpx/vpx_codec.h | 96 +- media/libvpx/libvpx/vpx/vpx_codec.mk | 9 +- media/libvpx/libvpx/vpx/vpx_decoder.h | 72 +- media/libvpx/libvpx/vpx/vpx_encoder.h | 342 +- media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h | 605 + media/libvpx/libvpx/vpx/vpx_frame_buffer.h | 14 +- media/libvpx/libvpx/vpx/vpx_image.h | 104 +- media/libvpx/libvpx/vpx/vpx_integer.h | 43 +- media/libvpx/libvpx/vpx/vpx_tpl.h | 69 + media/libvpx/libvpx/vpx_dsp/add_noise.c | 3 + media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c | 191 +- .../libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c | 65 + .../libvpx/libvpx/vpx_dsp/arm/deblock_neon.c | 104 +- .../libvpx/vpx_dsp/arm/fdct16x16_neon.c | 439 + .../libvpx/vpx_dsp/arm/fdct16x16_neon.h | 318 + .../libvpx/vpx_dsp/arm/fdct32x32_neon.c | 419 + .../libvpx/vpx_dsp/arm/fdct32x32_neon.h | 2919 +++ .../libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c | 85 + .../libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h | 105 + .../libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c | 143 + .../libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h | 307 + media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h | 542 + .../libvpx/vpx_dsp/arm/fdct_partial_neon.c | 183 + .../libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 220 - .../libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c | 85 +- .../libvpx/vpx_dsp/arm/highbd_avg_neon.c | 140 + .../libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c | 64 + .../vpx_dsp/arm/highbd_convolve8_neon.h | 46 + .../libvpx/vpx_dsp/arm/highbd_convolve8_sve.h | 99 + .../libvpx/vpx_dsp/arm/highbd_hadamard_neon.c | 215 + .../vpx_dsp/arm/highbd_idct16x16_add_neon.c | 1361 ++ .../arm/highbd_idct32x32_1024_add_neon.c | 640 + .../arm/highbd_idct32x32_135_add_neon.c | 757 + .../arm/highbd_idct32x32_34_add_neon.c | 625 + .../vpx_dsp/arm/highbd_idct32x32_add_neon.c | 88 + .../vpx_dsp/arm/highbd_idct4x4_add_neon.c | 142 +- .../vpx_dsp/arm/highbd_idct8x8_add_neon.c | 625 +- .../libvpx/vpx_dsp/arm/highbd_idct_neon.h | 474 + .../vpx_dsp/arm/highbd_intrapred_neon.c | 1886 +- .../vpx_dsp/arm/highbd_loopfilter_neon.c | 15 + .../libvpx/vpx_dsp/arm/highbd_quantize_neon.c | 300 + .../libvpx/vpx_dsp/arm/highbd_sad4d_neon.c | 273 + .../libvpx/vpx_dsp/arm/highbd_sad_neon.c | 452 + .../libvpx/vpx_dsp/arm/highbd_sse_neon.c | 238 + .../vpx_dsp/arm/highbd_subpel_variance_neon.c | 586 + .../libvpx/vpx_dsp/arm/highbd_variance_neon.c | 436 + .../arm/highbd_variance_neon_dotprod.c | 96 + .../libvpx/vpx_dsp/arm/highbd_variance_sve.c | 344 + .../vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 1940 +- .../vpx_dsp/arm/highbd_vpx_convolve8_sve.c | 271 + .../vpx_dsp/arm/highbd_vpx_convolve8_sve2.c | 660 + .../arm/highbd_vpx_convolve_avg_neon.c | 20 +- .../arm/highbd_vpx_convolve_copy_neon.c | 124 +- .../vpx_dsp/arm/highbd_vpx_convolve_neon.c | 65 - .../vpx_dsp/arm/idct16x16_1_add_neon.asm | 196 - .../libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c | 3 +- .../libvpx/vpx_dsp/arm/idct16x16_add_neon.asm | 1176 - .../libvpx/vpx_dsp/arm/idct16x16_add_neon.c | 1932 +- .../libvpx/vpx_dsp/arm/idct16x16_neon.c | 160 - .../vpx_dsp/arm/idct32x32_135_add_neon.c | 908 +- .../libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c | 3 +- .../vpx_dsp/arm/idct32x32_34_add_neon.c | 683 +- .../libvpx/vpx_dsp/arm/idct32x32_add_neon.c | 967 +- .../libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c | 4 +- .../libvpx/vpx_dsp/arm/idct4x4_add_neon.asm | 2 +- .../libvpx/vpx_dsp/arm/idct4x4_add_neon.c | 57 +- .../libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm | 86 - .../libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c | 3 +- .../libvpx/vpx_dsp/arm/idct8x8_add_neon.asm | 507 - .../libvpx/vpx_dsp/arm/idct8x8_add_neon.c | 113 +- media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h | 1008 +- .../libvpx/vpx_dsp/arm/intrapred_neon.c | 1345 +- .../libvpx/vpx_dsp/arm/loopfilter_8_neon.asm | 2 +- .../libvpx/vpx_dsp/arm/loopfilter_neon.c | 17 +- media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 757 + .../libvpx/libvpx/vpx_dsp/arm/quantize_neon.c | 286 + media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c | 372 +- .../libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c | 176 + media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c | 516 +- .../libvpx/vpx_dsp/arm/sad_neon_dotprod.c | 247 + media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c | 188 + .../libvpx/vpx_dsp/arm/sse_neon_dotprod.c | 197 + .../libvpx/vpx_dsp/arm/subpel_variance_neon.c | 556 +- .../libvpx/libvpx/vpx_dsp/arm/subtract_neon.c | 140 +- media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h | 275 + .../libvpx/vpx_dsp/arm/sum_squares_neon.c | 100 + .../libvpx/vpx_dsp/arm/sum_squares_sve.c | 73 + .../libvpx/vpx_dsp/arm/transpose_neon.h | 685 +- .../libvpx/libvpx/vpx_dsp/arm/variance_neon.c | 655 +- .../vpx_dsp/arm/variance_neon_dotprod.c | 298 + ..._convolve8_avg_horiz_filter_type1_neon.asm | 438 + ..._convolve8_avg_horiz_filter_type2_neon.asm | 439 + .../arm/vpx_convolve8_avg_neon_asm.asm | 292 - ...x_convolve8_avg_vert_filter_type1_neon.asm | 486 + ...x_convolve8_avg_vert_filter_type2_neon.asm | 487 + .../vpx_convolve8_horiz_filter_type1_neon.asm | 415 + .../vpx_convolve8_horiz_filter_type2_neon.asm | 415 + .../libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 1356 +- .../libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 172 + .../vpx_dsp/arm/vpx_convolve8_neon_asm.asm | 270 - .../vpx_dsp/arm/vpx_convolve8_neon_asm.c | 41 + .../vpx_dsp/arm/vpx_convolve8_neon_asm.h | 29 + .../vpx_dsp/arm/vpx_convolve8_neon_dotprod.c | 1024 + .../vpx_dsp/arm/vpx_convolve8_neon_i8mm.c | 943 + .../vpx_convolve8_vert_filter_type1_neon.asm | 457 + .../vpx_convolve8_vert_filter_type2_neon.asm | 455 + .../vpx_dsp/arm/vpx_convolve_avg_neon.c | 22 +- .../vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm | 2 +- .../vpx_dsp/arm/vpx_convolve_copy_neon.c | 26 +- .../arm/vpx_convolve_copy_neon_asm.asm | 2 +- .../libvpx/vpx_dsp/arm/vpx_convolve_neon.c | 68 +- .../libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h | 46 + .../libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h | 56 + .../vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 316 + media/libvpx/libvpx/vpx_dsp/avg.c | 248 +- media/libvpx/libvpx/vpx_dsp/bitreader.h | 37 +- .../libvpx/libvpx/vpx_dsp/bitreader_buffer.c | 8 +- .../libvpx/libvpx/vpx_dsp/bitreader_buffer.h | 6 +- media/libvpx/libvpx/vpx_dsp/bitwriter.c | 32 +- media/libvpx/libvpx/vpx_dsp/bitwriter.h | 67 +- .../libvpx/libvpx/vpx_dsp/bitwriter_buffer.c | 25 +- .../libvpx/libvpx/vpx_dsp/bitwriter_buffer.h | 22 +- media/libvpx/libvpx/vpx_dsp/deblock.c | 48 +- media/libvpx/libvpx/vpx_dsp/fastssim.c | 56 +- media/libvpx/libvpx/vpx_dsp/fwd_txfm.c | 69 +- media/libvpx/libvpx/vpx_dsp/fwd_txfm.h | 6 +- media/libvpx/libvpx/vpx_dsp/intrapred.c | 300 +- media/libvpx/libvpx/vpx_dsp/inv_txfm.c | 1866 +- media/libvpx/libvpx/vpx_dsp/inv_txfm.h | 7 +- .../libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c | 90 + .../libvpx/vpx_dsp/loongarch/avg_pred_lsx.c | 83 + .../loongarch/bitdepth_conversion_lsx.h | 41 + .../vpx_dsp/loongarch/fwd_dct32x32_lsx.c | 1176 + .../libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c | 350 + .../libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h | 381 + .../libvpx/vpx_dsp/loongarch/idct32x32_lsx.c | 834 + .../libvpx/vpx_dsp/loongarch/intrapred_lsx.c | 98 + .../vpx_dsp/loongarch/loopfilter_16_lsx.c | 1320 ++ .../vpx_dsp/loongarch/loopfilter_4_lsx.c | 214 + .../vpx_dsp/loongarch/loopfilter_8_lsx.c | 458 + .../libvpx/vpx_dsp/loongarch/loopfilter_lsx.h | 167 + .../libvpx/vpx_dsp/loongarch/quantize_lsx.c | 244 + .../libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c | 717 + .../loongarch/sub_pixel_variance_lsx.c | 874 + .../libvpx/vpx_dsp/loongarch/subtract_lsx.c | 371 + .../vpx_dsp/loongarch/txfm_macros_lsx.h | 48 + .../libvpx/vpx_dsp/loongarch/variance_lsx.c | 263 + .../libvpx/vpx_dsp/loongarch/variance_lsx.h | 62 + .../loongarch/vpx_convolve8_avg_horiz_lsx.c | 972 + .../vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c | 737 + .../loongarch/vpx_convolve8_avg_vert_lsx.c | 918 + .../loongarch/vpx_convolve8_horiz_lsx.c | 814 + .../vpx_dsp/loongarch/vpx_convolve8_lsx.c | 697 + .../loongarch/vpx_convolve8_vert_lsx.c | 825 + .../vpx_dsp/loongarch/vpx_convolve_avg_lsx.c | 321 + .../vpx_dsp/loongarch/vpx_convolve_copy_lsx.c | 437 + .../vpx_dsp/loongarch/vpx_convolve_lsx.h | 138 + media/libvpx/libvpx/vpx_dsp/loopfilter.c | 220 +- .../libvpx/vpx_dsp/mips/add_noise_msa.c | 4 +- media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c | 675 + .../libvpx/libvpx/vpx_dsp/mips/common_dspr2.h | 6 +- .../libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c | 11 +- .../vpx_dsp/mips/convolve2_avg_horiz_dspr2.c | 9 +- .../vpx_dsp/mips/convolve2_horiz_dspr2.c | 11 +- .../vpx_dsp/mips/convolve2_vert_dspr2.c | 11 +- .../libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c | 44 +- .../vpx_dsp/mips/convolve8_avg_horiz_dspr2.c | 17 +- .../libvpx/vpx_dsp/mips/convolve8_dspr2.c | 25 +- .../vpx_dsp/mips/convolve8_horiz_dspr2.c | 15 +- .../vpx_dsp/mips/convolve8_vert_dspr2.c | 15 +- .../vpx_dsp/mips/convolve_common_dspr2.h | 28 +- .../libvpx/libvpx/vpx_dsp/mips/deblock_msa.c | 151 +- .../libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c | 33 +- .../libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c | 45 +- .../libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h | 22 +- .../libvpx/vpx_dsp/mips/idct16x16_msa.c | 1 + .../libvpx/vpx_dsp/mips/idct32x32_msa.c | 1 + .../libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c | 1 + .../libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c | 1 + .../libvpx/vpx_dsp/mips/inv_txfm_dspr2.h | 7 +- .../libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h | 6 +- .../libvpx/vpx_dsp/mips/itrans4_dspr2.c | 1 + .../libvpx/vpx_dsp/mips/loopfilter_16_msa.c | 64 +- .../libvpx/vpx_dsp/mips/loopfilter_4_msa.c | 1 + .../libvpx/vpx_dsp/mips/loopfilter_8_msa.c | 1 + .../vpx_dsp/mips/loopfilter_filters_dspr2.h | 6 +- .../vpx_dsp/mips/loopfilter_macros_dspr2.h | 6 +- .../vpx_dsp/mips/loopfilter_masks_dspr2.h | 6 +- .../libvpx/vpx_dsp/mips/loopfilter_msa.h | 6 +- media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h | 596 +- media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c | 807 + media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c | 750 +- .../vpx_dsp/mips/sub_pixel_variance_msa.c | 61 +- .../libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c | 306 + .../libvpx/vpx_dsp/mips/sum_squares_msa.c | 129 + .../libvpx/vpx_dsp/mips/txfm_macros_msa.h | 6 +- .../libvpx/libvpx/vpx_dsp/mips/variance_mmi.c | 1357 ++ .../libvpx/libvpx/vpx_dsp/mips/variance_msa.c | 5 +- .../mips/vpx_convolve8_avg_horiz_msa.c | 123 +- .../vpx_dsp/mips/vpx_convolve8_avg_msa.c | 105 +- .../vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c | 82 +- .../vpx_dsp/mips/vpx_convolve8_horiz_msa.c | 15 +- .../libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c | 716 + .../libvpx/vpx_dsp/mips/vpx_convolve8_msa.c | 622 +- .../vpx_dsp/mips/vpx_convolve8_vert_msa.c | 15 +- .../vpx_dsp/mips/vpx_convolve_avg_msa.c | 14 +- .../vpx_dsp/mips/vpx_convolve_copy_msa.c | 14 +- .../libvpx/vpx_dsp/mips/vpx_convolve_msa.h | 23 +- media/libvpx/libvpx/vpx_dsp/postproc.h | 6 +- .../vpx_dsp/ppc/bitdepth_conversion_vsx.h | 47 + media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c | 374 + .../libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c | 553 + .../libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c | 119 + .../libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c | 767 + .../libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c | 1828 ++ .../libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h | 48 + .../libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c | 301 + media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c | 261 + .../libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c | 117 + .../libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h | 133 + .../libvpx/vpx_dsp/ppc/txfm_common_vsx.h | 90 + media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h | 108 + .../libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c | 271 + .../libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c | 408 + media/libvpx/libvpx/vpx_dsp/prob.h | 12 +- media/libvpx/libvpx/vpx_dsp/psnr.c | 102 +- media/libvpx/libvpx/vpx_dsp/psnr.h | 48 +- media/libvpx/libvpx/vpx_dsp/psnrhvs.c | 18 +- media/libvpx/libvpx/vpx_dsp/quantize.c | 336 +- media/libvpx/libvpx/vpx_dsp/quantize.h | 32 +- media/libvpx/libvpx/vpx_dsp/sad.c | 200 +- media/libvpx/libvpx/vpx_dsp/skin_detection.c | 79 + media/libvpx/libvpx/vpx_dsp/skin_detection.h | 24 + media/libvpx/libvpx/vpx_dsp/sse.c | 59 + media/libvpx/libvpx/vpx_dsp/ssim.c | 31 +- media/libvpx/libvpx/vpx_dsp/ssim.h | 6 +- media/libvpx/libvpx/vpx_dsp/subtract.c | 28 +- media/libvpx/libvpx/vpx_dsp/sum_squares.c | 5 +- media/libvpx/libvpx/vpx_dsp/txfm_common.h | 76 +- media/libvpx/libvpx/vpx_dsp/variance.c | 567 +- media/libvpx/libvpx/vpx_dsp/variance.h | 55 +- media/libvpx/libvpx/vpx_dsp/vpx_convolve.c | 406 +- media/libvpx/libvpx/vpx_dsp/vpx_convolve.h | 18 +- media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 227 +- media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h | 37 +- media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 2 +- .../libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 2103 +- media/libvpx/libvpx/vpx_dsp/vpx_filter.h | 27 +- .../libvpx/vpx_dsp/x86/add_noise_sse2.asm | 8 +- .../libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 519 + .../libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 272 +- .../libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c | 111 + .../libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c | 69 + .../libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm | 83 +- .../vpx_dsp/x86/bitdepth_conversion_avx2.h | 44 + .../vpx_dsp/x86/bitdepth_conversion_sse2.asm | 90 + .../{fdct.h => bitdepth_conversion_sse2.h} | 17 +- media/libvpx/libvpx/vpx_dsp/x86/convolve.h | 353 +- .../libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h | 161 + .../libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h | 88 + .../libvpx/vpx_dsp/x86/convolve_ssse3.h | 112 + .../libvpx/vpx_dsp/x86/deblock_sse2.asm | 241 +- .../vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 256 +- .../vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 262 +- .../libvpx/libvpx/vpx_dsp/x86/fwd_txfm_avx2.c | 377 + .../libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h | 10 +- .../libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h | 10 +- .../vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 416 +- .../libvpx/vpx_dsp/x86/highbd_convolve_avx2.c | 1495 ++ .../vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 355 + .../vpx_dsp/x86/highbd_idct16x16_add_sse4.c | 349 + .../vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 782 + .../vpx_dsp/x86/highbd_idct32x32_add_sse4.c | 765 + .../vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 160 + .../vpx_dsp/x86/highbd_idct4x4_add_sse4.c | 47 + .../vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 213 + .../vpx_dsp/x86/highbd_idct8x8_add_sse4.c | 210 + .../x86/highbd_intrapred_intrin_sse2.c | 534 + .../x86/highbd_intrapred_intrin_ssse3.c | 930 + .../vpx_dsp/x86/highbd_intrapred_sse2.asm | 16 +- .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 404 + .../libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h | 112 + .../vpx_dsp/x86/highbd_loopfilter_sse2.c | 375 +- .../vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 254 + .../vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 187 +- .../libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 462 + .../libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 43 +- .../libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 522 + .../libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 63 +- .../x86/highbd_subpel_variance_impl_sse2.asm | 398 +- .../vpx_dsp/x86/highbd_variance_impl_sse2.asm | 22 +- .../libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 242 +- .../libvpx/vpx_dsp/x86/intrapred_sse2.asm | 2 +- .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c | 626 + .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 4318 +--- .../libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h | 820 +- .../libvpx/vpx_dsp/x86/inv_txfm_ssse3.c | 364 + .../libvpx/vpx_dsp/x86/inv_txfm_ssse3.h | 110 + .../vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 1793 -- .../libvpx/vpx_dsp/x86/inv_wht_sse2.asm | 12 +- .../libvpx/vpx_dsp/x86/loopfilter_avx2.c | 202 +- ...filter_intrin_sse2.c => loopfilter_sse2.c} | 597 +- media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h | 154 + .../libvpx/vpx_dsp/x86/post_proc_sse2.c | 141 + .../libvpx/libvpx/vpx_dsp/x86/quantize_avx.c | 254 + .../libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c | 290 + .../vpx_dsp/x86/quantize_avx_x86_64.asm | 544 - .../libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c | 254 +- .../libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h | 126 + .../libvpx/vpx_dsp/x86/quantize_ssse3.c | 228 + .../libvpx/vpx_dsp/x86/quantize_ssse3.h | 51 + .../vpx_dsp/x86/quantize_ssse3_x86_64.asm | 346 - media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c | 306 +- .../libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c | 105 + .../libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm | 43 +- media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c | 174 +- media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c | 88 + media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm | 74 +- media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm | 374 - media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm | 359 - media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm | 370 - media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c | 368 + media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c | 312 + .../libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm | 97 +- .../vpx_dsp/x86/subpel_variance_sse2.asm | 359 +- .../libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c | 203 + .../libvpx/vpx_dsp/x86/subtract_sse2.asm | 1 + .../libvpx/vpx_dsp/x86/sum_squares_sse2.c | 191 +- .../libvpx/vpx_dsp/x86/transpose_sse2.h | 367 + .../libvpx/vpx_dsp/x86/txfm_common_sse2.h | 9 +- .../libvpx/libvpx/vpx_dsp/x86/variance_avx2.c | 899 +- .../libvpx/vpx_dsp/x86/variance_impl_avx2.c | 708 - .../libvpx/libvpx/vpx_dsp/x86/variance_sse2.c | 708 +- .../libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c | 162 - .../vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 10 +- .../vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 42 +- .../x86/vpx_high_subpixel_bilinear_sse2.asm | 40 +- .../vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 1161 + .../vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 1674 +- .../x86/vpx_subpixel_8t_intrin_ssse3.c | 1281 +- .../vpx_dsp/x86/vpx_subpixel_8t_sse2.asm | 26 +- .../vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 50 +- .../x86/vpx_subpixel_bilinear_sse2.asm | 26 +- .../x86/vpx_subpixel_bilinear_ssse3.asm | 26 +- .../libvpx/vpx_mem/include/vpx_mem_intrnl.h | 8 +- media/libvpx/libvpx/vpx_mem/vpx_mem.c | 11 +- media/libvpx/libvpx/vpx_mem/vpx_mem.h | 15 +- .../libvpx/vpx_ports/aarch32_cpudetect.c | 90 + .../libvpx/vpx_ports/aarch64_cpudetect.c | 241 + media/libvpx/libvpx/vpx_ports/arm.h | 22 +- media/libvpx/libvpx/vpx_ports/arm_cpudetect.c | 154 - media/libvpx/libvpx/vpx_ports/arm_cpudetect.h | 52 + media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h | 81 + media/libvpx/libvpx/vpx_ports/bitops.h | 31 +- .../libvpx/vpx_ports/compiler_attributes.h | 69 + .../libvpx/vpx_ports/emmintrin_compat.h | 6 +- media/libvpx/libvpx/vpx_ports/emms_mmx.asm | 18 + .../libvpx/vpx_ports/{config.h => emms_mmx.c} | 9 +- .../{emms.asm => float_control_word.asm} | 9 +- media/libvpx/libvpx/vpx_ports/loongarch.h | 29 + .../libvpx/vpx_ports/loongarch_cpudetect.c | 40 + media/libvpx/libvpx/vpx_ports/mem.h | 36 +- media/libvpx/libvpx/vpx_ports/mem_ops.h | 7 +- .../libvpx/libvpx/vpx_ports/mem_ops_aligned.h | 6 +- media/libvpx/libvpx/vpx_ports/mips.h | 27 + .../libvpx/libvpx/vpx_ports/mips_cpudetect.c | 57 + media/libvpx/libvpx/vpx_ports/msvc.h | 32 - media/libvpx/libvpx/vpx_ports/ppc.h | 29 + media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c | 80 + media/libvpx/libvpx/vpx_ports/static_assert.h | 30 + media/libvpx/libvpx/vpx_ports/system_state.h | 22 +- media/libvpx/libvpx/vpx_ports/vpx_once.h | 31 +- media/libvpx/libvpx/vpx_ports/vpx_ports.mk | 40 +- media/libvpx/libvpx/vpx_ports/vpx_timer.h | 38 +- media/libvpx/libvpx/vpx_ports/x86.h | 196 +- .../libvpx/vpx_ports/x86_abi_support.asm | 75 +- .../libvpx/vpx_scale/generic/gen_scalers.c | 4 +- .../libvpx/vpx_scale/generic/vpx_scale.c | 7 +- .../libvpx/vpx_scale/generic/yv12config.c | 73 +- .../libvpx/vpx_scale/generic/yv12extend.c | 67 +- media/libvpx/libvpx/vpx_scale/vpx_scale.h | 6 +- .../libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c | 2 +- .../libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl | 12 + media/libvpx/libvpx/vpx_scale/yv12config.h | 10 +- media/libvpx/libvpx/vpx_util/endian_inl.h | 6 +- .../libvpx/vpx_util/loongson_intrinsics.h | 2090 ++ media/libvpx/libvpx/vpx_util/vpx_atomics.h | 110 + media/libvpx/libvpx/vpx_util/vpx_debug_util.c | 282 + media/libvpx/libvpx/vpx_util/vpx_debug_util.h | 70 + media/libvpx/libvpx/vpx_util/vpx_pthread.h | 150 + media/libvpx/libvpx/vpx_util/vpx_thread.c | 93 +- media/libvpx/libvpx/vpx_util/vpx_thread.h | 349 +- media/libvpx/libvpx/vpx_util/vpx_timestamp.h | 49 + media/libvpx/libvpx/vpx_util/vpx_util.mk | 7 + .../libvpx/vpx_util/vpx_write_yuv_frame.c | 46 + .../libvpx/vpx_util/vpx_write_yuv_frame.h | 27 + media/libvpx/libvpx/vpx_version.h | 11 + media/libvpx/libvpx/vpxdec.c | 191 +- media/libvpx/libvpx/vpxenc.c | 695 +- media/libvpx/libvpx/vpxenc.h | 7 +- media/libvpx/libvpx/vpxstats.c | 2 +- media/libvpx/libvpx/vpxstats.h | 6 +- media/libvpx/libvpx/warnings.c | 2 +- media/libvpx/libvpx/warnings.h | 6 +- media/libvpx/libvpx/webmdec.cc | 49 +- media/libvpx/libvpx/webmdec.h | 8 +- media/libvpx/libvpx/webmenc.cc | 4 +- media/libvpx/libvpx/webmenc.h | 6 +- media/libvpx/libvpx/y4menc.c | 44 +- media/libvpx/libvpx/y4menc.h | 6 +- media/libvpx/libvpx/y4minput.c | 717 +- media/libvpx/libvpx/y4minput.h | 18 +- media/libvpx/moz.build | 55 +- media/libvpx/sources.mozbuild | 1297 +- 1282 files changed, 260258 insertions(+), 118999 deletions(-) create mode 100644 media/libvpx/config/linux/arm64/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vpx_config.asm create mode 100644 media/libvpx/config/linux/arm64/vpx_config.c create mode 100644 media/libvpx/config/linux/arm64/vpx_config.h create mode 100644 media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/arm64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/loongarch64/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/loongarch64/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.asm create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.c create mode 100644 media/libvpx/config/linux/loongarch64/vpx_config.h create mode 100644 media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/mips32/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/mips32/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/mips32/vpx_config.asm create mode 100644 media/libvpx/config/linux/mips32/vpx_config.c create mode 100644 media/libvpx/config/linux/mips32/vpx_config.h create mode 100644 media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/mips32/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/mips64/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/mips64/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/mips64/vpx_config.asm create mode 100644 media/libvpx/config/linux/mips64/vpx_config.c create mode 100644 media/libvpx/config/linux/mips64/vpx_config.h create mode 100644 media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/mips64/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/linux/ppc64le/vp8_rtcd.h create mode 100644 media/libvpx/config/linux/ppc64le/vp9_rtcd.h create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.asm create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.c create mode 100644 media/libvpx/config/linux/ppc64le/vpx_config.h create mode 100644 media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h create mode 100644 media/libvpx/config/mac/arm64/vp8_rtcd.h create mode 100644 media/libvpx/config/mac/arm64/vp9_rtcd.h create mode 100644 media/libvpx/config/mac/arm64/vpx_config.asm create mode 100644 media/libvpx/config/mac/arm64/vpx_config.c create mode 100644 media/libvpx/config/mac/arm64/vpx_config.h create mode 100644 media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h create mode 100644 media/libvpx/config/mac/arm64/vpx_scale_rtcd.h create mode 100644 media/libvpx/libvpx/CONTRIBUTING.md delete mode 100644 media/libvpx/libvpx/examples/resize_util.c rename media/libvpx/libvpx/{vpx => examples}/svc_context.h (83%) rename media/libvpx/libvpx/{vpx/src => examples}/svc_encodeframe.c (80%) create mode 100644 media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc create mode 100644 media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc create mode 100644 media/libvpx/libvpx/test/bench.cc create mode 100644 media/libvpx/libvpx/test/bench.h create mode 100644 media/libvpx/libvpx/test/buffer.h create mode 100644 media/libvpx/libvpx/test/comp_avg_pred_test.cc delete mode 100644 media/libvpx/libvpx/test/datarate_test.cc create mode 100644 media/libvpx/libvpx/test/dct_partial_test.cc create mode 100644 media/libvpx/libvpx/test/dct_test.cc create mode 100644 media/libvpx/libvpx/test/decode_corrupted.cc delete mode 100644 media/libvpx/libvpx/test/fdct4x4_test.cc create mode 100644 media/libvpx/libvpx/test/init_vpx_test.cc create mode 100644 media/libvpx/libvpx/test/init_vpx_test.h create mode 100644 media/libvpx/libvpx/test/non_greedy_mv_test.cc delete mode 100644 media/libvpx/libvpx/test/resize_util.sh create mode 100644 media/libvpx/libvpx/test/svc_datarate_test.cc create mode 100644 media/libvpx/libvpx/test/svc_end_to_end_test.cc create mode 100644 media/libvpx/libvpx/test/svc_test.h create mode 100644 media/libvpx/libvpx/test/test_rc_interface.cc create mode 100644 media/libvpx/libvpx/test/timestamp_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_datarate_test.cc create mode 100644 media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc rename media/libvpx/libvpx/test/{vp9_error_block_test.cc => vp9_block_error_test.cc} (58%) create mode 100755 media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh create mode 100644 media/libvpx/libvpx/test/vp9_datarate_test.cc rename media/libvpx/libvpx/test/{vp9_denoiser_sse2_test.cc => vp9_denoiser_test.cc} (50%) create mode 100644 media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc delete mode 100644 media/libvpx/libvpx/test/vp9_frame_parallel_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_motion_vector_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_roi_test.cc create mode 100644 media/libvpx/libvpx/test/vp9_scale_test.cc delete mode 100644 media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh create mode 100644 media/libvpx/libvpx/test/vpx_image_test.cc create mode 100644 media/libvpx/libvpx/test/vpx_scale_test.h create mode 100644 media/libvpx/libvpx/test/yuv_temporal_filter_test.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/.clang-format delete mode 100644 media/libvpx/libvpx/third_party/googletest/src/CHANGES delete mode 100644 media/libvpx/libvpx/third_party/googletest/src/README create mode 100644 media/libvpx/libvpx/third_party/googletest/src/README.md create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc create mode 100644 media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/LICENSE create mode 100644 media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc delete mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc create mode 100644 media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc create mode 100644 media/libvpx/libvpx/third_party/nalloc/LICENSE create mode 100644 media/libvpx/libvpx/third_party/nalloc/README.libvpx create mode 100644 media/libvpx/libvpx/third_party/nalloc/nalloc.h create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde create mode 100644 media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde create mode 100644 media/libvpx/libvpx/tools/README.pgo.md delete mode 100644 media/libvpx/libvpx/tools/all_builds.py delete mode 100644 media/libvpx/libvpx/tools/author_first_release.sh delete mode 100644 media/libvpx/libvpx/tools/ftfy.sh create mode 100644 media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py create mode 100644 media/libvpx/libvpx/tools/set_analyzer_env.sh create mode 100644 media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h delete mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c delete mode 100644 media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.c create mode 100644 media/libvpx/libvpx/vp8/common/vp8_skin_detection.h create mode 100644 media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c delete mode 100644 media/libvpx/libvpx/vp8/common/x86/filter_x86.c delete mode 100644 media/libvpx/libvpx/vp8/common/x86/filter_x86.h rename media/libvpx/libvpx/vp8/{common => encoder}/copy_c.c (100%) create mode 100644 media/libvpx/libvpx/vp8/encoder/ethreading.h create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c create mode 100644 media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c create mode 100644 media/libvpx/libvpx/vp8/encoder/picklpf.h create mode 100644 media/libvpx/libvpx/vp8/encoder/temporal_filter.h rename media/libvpx/libvpx/vp8/encoder/x86/{encodeopt.asm => block_error_sse2.asm} (97%) rename media/libvpx/libvpx/vp8/{common => encoder}/x86/copy_sse2.asm (98%) rename media/libvpx/libvpx/vp8/{common => encoder}/x86/copy_sse3.asm (99%) delete mode 100644 media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm delete mode 100644 media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c rename media/libvpx/libvpx/vp8/encoder/x86/{quantize_ssse3.c => vp8_quantize_ssse3.c} (76%) create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc create mode 100644 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h create mode 100644 media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c create mode 100644 media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c delete mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dthread.c delete mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_dthread.h create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c create mode 100644 media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c create mode 100644 media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c create mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm delete mode 100644 media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.cc create mode 100644 media/libvpx/libvpx/vp9/ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vp9/vp9_cx_iface.h create mode 100644 media/libvpx/libvpx/vp9/vp9_iface_common.c delete mode 100644 media/libvpx/libvpx/vpx/exports_spatial_svc create mode 100644 media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h create mode 100644 media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h create mode 100644 media/libvpx/libvpx/vpx/vpx_tpl.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h delete mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.c create mode 100644 media/libvpx/libvpx/vpx_dsp/skin_detection.h create mode 100644 media/libvpx/libvpx/vpx_dsp/sse.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm rename media/libvpx/libvpx/vpx_dsp/x86/{fdct.h => bitdepth_conversion_sse2.h} (76%) create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm rename media/libvpx/libvpx/vpx_dsp/x86/{loopfilter_intrin_sse2.c => loopfilter_sse2.c} (80%) create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c delete mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c create mode 100644 media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c create mode 100644 media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c delete mode 100644 media/libvpx/libvpx/vpx_ports/arm_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/arm_cpudetect.h create mode 100644 media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h create mode 100644 media/libvpx/libvpx/vpx_ports/compiler_attributes.h create mode 100644 media/libvpx/libvpx/vpx_ports/emms_mmx.asm rename media/libvpx/libvpx/vpx_ports/{config.h => emms_mmx.c} (66%) rename media/libvpx/libvpx/vpx_ports/{emms.asm => float_control_word.asm} (81%) create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch.h create mode 100644 media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/mips.h create mode 100644 media/libvpx/libvpx/vpx_ports/mips_cpudetect.c delete mode 100644 media/libvpx/libvpx/vpx_ports/msvc.h create mode 100644 media/libvpx/libvpx/vpx_ports/ppc.h create mode 100644 media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c create mode 100644 media/libvpx/libvpx/vpx_ports/static_assert.h create mode 100644 media/libvpx/libvpx/vpx_util/loongson_intrinsics.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_atomics.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.c create mode 100644 media/libvpx/libvpx/vpx_util/vpx_debug_util.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_pthread.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_timestamp.h create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c create mode 100644 media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h create mode 100644 media/libvpx/libvpx/vpx_version.h diff --git a/media/libvpx/README_MOZILLA b/media/libvpx/README_MOZILLA index 855d24111d..a04dea3485 100644 --- a/media/libvpx/README_MOZILLA +++ b/media/libvpx/README_MOZILLA @@ -8,4 +8,4 @@ The libvpx git repository is: https://chromium.googlesource.com/webm/libvpx -The git commit ID used was v1.6.1 +The git commit ID used was v1.16.0 diff --git a/media/libvpx/config/generic/vp8_rtcd.h b/media/libvpx/config/generic/vp8_rtcd.h index d0aebb66de..c68b01f24f 100644 --- a/media/libvpx/config/generic/vp8_rtcd.h +++ b/media/libvpx/config/generic/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,31 +37,34 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c int vp8_block_error_c(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_c -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem16x16 vp8_copy_mem16x16_c -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x4 vp8_copy_mem8x4_c -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x8 vp8_copy_mem8x8_c -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_dc_only_idct_add vp8_dc_only_idct_add_c int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -59,7 +73,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_c void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -68,7 +82,7 @@ void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c -void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); #define vp8_dequantize_b vp8_dequantize_b_c int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -77,31 +91,28 @@ int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct block void vp8_fast_quantize_b_c(struct block *, struct blockd *); #define vp8_fast_quantize_b vp8_fast_quantize_b_c -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -#define vp8_full_search_sad vp8_full_search_sad_c - -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bh vp8_loop_filter_bh_c -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bv vp8_loop_filter_bv_c -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_c -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbv vp8_loop_filter_mbv_c -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c int vp8_mbblock_error_c(struct macroblock *mb, int dc); @@ -110,7 +121,7 @@ int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sad_c void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -122,28 +133,28 @@ void vp8_short_fdct4x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_c(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_c -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_short_idct4x4llm vp8_short_idct4x4llm_c -void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_c -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -163,4 +174,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/generic/vp9_rtcd.h b/media/libvpx/config/generic/vp9_rtcd.h index acda9dadf1..c914437a70 100644 --- a/media/libvpx/config/generic/vp9_rtcd.h +++ b/media/libvpx/config/generic/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,18 +46,18 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c - void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht16x16 vp9_fht16x16_c @@ -50,13 +67,10 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_c -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -65,17 +79,23 @@ void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_c -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c void vp9_rtcd(void); @@ -91,4 +111,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm index b4366a5a8e..cabc9f0a07 100644 --- a/media/libvpx/config/generic/vpx_config.asm +++ b/media/libvpx/config/generic/vpx_config.asm @@ -1,12 +1,19 @@ @ This file was created from a .asm file @ using the ads2gas.pl script. - .equ DO1STROUNDING, 0 -.equ ARCH_ARM , 0 -.equ ARCH_MIPS , 0 -.equ ARCH_X86 , 0 -.equ ARCH_X86_64 , 0 -.equ HAVE_NEON , 0 +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 .equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -19,6 +26,11 @@ .equ HAVE_SSE4_1 , 0 .equ HAVE_AVX , 0 .equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_PTHREAD_H , 1 .equ CONFIG_DEPENDENCY_TRACKING , 1 @@ -75,8 +87,11 @@ .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 1 -.equ CONFIG_SPATIAL_SVC , 0 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 .equ CONFIG_FP_MB_STATS , 0 .equ CONFIG_EMULATE_HARDWARE , 0 -.equ CONFIG_MISC_FIXES , 0 - .section .note.GNU-stack,"",%progbits +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h index 8cc215eb5a..09d09da6da 100644 --- a/media/libvpx/config/generic/vpx_config.h +++ b/media/libvpx/config/generic/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 0 -#define ARCH_X86_64 0 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/generic/vpx_dsp_rtcd.h b/media/libvpx/config/generic/vpx_dsp_rtcd.h index 523bf33aed..ea44ef01ca 100644 --- a/media/libvpx/config/generic/vpx_dsp_rtcd.h +++ b/media/libvpx/config/generic/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,187 +44,154 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_c -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_c -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_c -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_c -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_c -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_c -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_c -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c - -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c - -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c - -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c - -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c - -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c - -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c - -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); @@ -238,37 +221,40 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct8x8_1 vpx_fdct8x8_1_c -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get16x16var vpx_get16x16var_c -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get8x8var vpx_get8x8var_c unsigned int vpx_get_mb_ss_c(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_c -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); #define vpx_hadamard_8x8 vpx_hadamard_8x8_c -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -280,6 +266,9 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c @@ -310,7 +299,7 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_c -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -358,22 +347,22 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vpx_minmax_8x8 vpx_minmax_8x8_c -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse16x16 vpx_mse16x16_c -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse16x8 vpx_mse16x8_c -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x16 vpx_mse8x16_c -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_c -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_c -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -382,22 +371,16 @@ unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_c -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x16x3 vpx_sad16x16x3_c - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_c -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x16x8 vpx_sad16x16x8_c - unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x32 vpx_sad16x32_c unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_c -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_c unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -406,22 +389,16 @@ unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_c -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x8x3 vpx_sad16x8x3_c - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_c -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x8x8 vpx_sad16x8x8_c - unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x16 vpx_sad32x16_c unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_c -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_c unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -430,22 +407,16 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_c -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_c unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_c -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_c unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -454,34 +425,25 @@ unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_c -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x4x3 vpx_sad4x4x3_c - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_c -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x4x8 vpx_sad4x4x8_c - unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad4x8 vpx_sad4x8_c unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_c -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_c -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_c unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_c -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_c unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -490,154 +452,214 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_c -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_c unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_c -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x16x3 vpx_sad8x16x3_c - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_c -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x16x8 vpx_sad8x16x8_c - unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x4 vpx_sad8x4_c unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_c -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_c -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_c unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_c -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x8x3 vpx_sad8x8x3_c - -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_c -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x8x8 vpx_sad8x8x8_c +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c int vpx_satd_c(const int16_t *coeff, int length); #define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_c -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); #define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); @@ -646,70 +668,70 @@ void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance16x16 vpx_variance16x16_c -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance16x32 vpx_variance16x32_c -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance16x8 vpx_variance16x8_c -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance32x16 vpx_variance32x16_c -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance32x32 vpx_variance32x32_c -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance32x64 vpx_variance32x64_c -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x4 vpx_variance4x4_c -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x8 vpx_variance4x8_c -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance64x32 vpx_variance64x32_c -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance64x64 vpx_variance64x64_c -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x16 vpx_variance8x16_c -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x4 vpx_variance8x4_c -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x8 vpx_variance8x8_c -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -729,4 +751,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/generic/vpx_scale_rtcd.h b/media/libvpx/config/generic/vpx_scale_rtcd.h index f419cc7a5f..58485cb00a 100644 --- a/media/libvpx/config/generic/vpx_scale_rtcd.h +++ b/media/libvpx/config/generic/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -63,4 +77,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/arm/vp8_rtcd.h b/media/libvpx/config/linux/arm/vp8_rtcd.h index 098795508a..688aa3ead5 100644 --- a/media/libvpx/config/linux/arm/vp8_rtcd.h +++ b/media/libvpx/config/linux/arm/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,40 +37,43 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_c -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -69,9 +83,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride); -RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -81,9 +95,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_neon(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_diamond_search_sad vp8_diamond_search_sad_c @@ -92,40 +106,37 @@ void vp8_fast_quantize_b_c(struct block *, struct blockd *); void vp8_fast_quantize_b_neon(struct block *, struct blockd *); RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -#define vp8_full_search_sad vp8_full_search_sad_c +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); int vp8_mbblock_error_c(struct macroblock *mb, int dc); #define vp8_mbblock_error vp8_mbblock_error_c @@ -133,7 +144,7 @@ int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbuverror_c(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_c -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sad_c void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -147,36 +158,36 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_neon(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_rtcd(void); @@ -261,4 +272,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/arm/vp9_rtcd.h b/media/libvpx/config/linux/arm/vp9_rtcd.h index 4a41b66711..bd174d6bdc 100644 --- a/media/libvpx/config/linux/arm/vp9_rtcd.h +++ b/media/libvpx/config/linux/arm/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -30,36 +47,35 @@ extern "C" { #endif int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_c +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -#define vp9_diamond_search_sad vp9_diamond_search_sad_c - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht16x16 vp9_fht16x16_c +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht4x4 vp9_fht4x4_c +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht8x8 vp9_fht8x8_c - -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -69,18 +85,17 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c - -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); void vp9_rtcd(void); @@ -94,16 +109,30 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_block_error = vp9_block_error_c; + if (flags & HAS_NEON) vp9_block_error = vp9_block_error_neon; vp9_block_error_fp = vp9_block_error_fp_c; if (flags & HAS_NEON) vp9_block_error_fp = vp9_block_error_fp_neon; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_c; - if (flags & HAS_NEON) vp9_fdct8x8_quant = vp9_fdct8x8_quant_neon; + vp9_diamond_search_sad = vp9_diamond_search_sad_c; + if (flags & HAS_NEON) vp9_diamond_search_sad = vp9_diamond_search_sad_neon; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_NEON) vp9_fht16x16 = vp9_fht16x16_neon; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_NEON) vp9_fht4x4 = vp9_fht4x4_neon; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_NEON) vp9_fht8x8 = vp9_fht8x8_neon; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_NEON) vp9_iht16x16_256_add = vp9_iht16x16_256_add_neon; vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon; vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon; vp9_quantize_fp = vp9_quantize_fp_c; if (flags & HAS_NEON) vp9_quantize_fp = vp9_quantize_fp_neon; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_NEON) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_neon; + vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; + if (flags & HAS_NEON) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_neon; } #endif @@ -111,4 +140,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm index 0aced5ddc3..9ea8e5f6e0 100644 --- a/media/libvpx/config/linux/arm/vpx_config.asm +++ b/media/libvpx/config/linux/arm/vpx_config.asm @@ -1,12 +1,19 @@ @ This file was created from a .asm file @ using the ads2gas.pl script. - .equ DO1STROUNDING, 0 -.equ ARCH_ARM , 1 -.equ ARCH_MIPS , 0 -.equ ARCH_X86 , 0 -.equ ARCH_X86_64 , 0 -.equ HAVE_NEON , 1 +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 .equ HAVE_NEON_ASM , 1 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -19,6 +26,11 @@ .equ HAVE_SSE4_1 , 0 .equ HAVE_AVX , 0 .equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_PTHREAD_H , 1 .equ CONFIG_DEPENDENCY_TRACKING , 1 @@ -75,8 +87,11 @@ .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 1 -.equ CONFIG_SPATIAL_SVC , 0 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 .equ CONFIG_FP_MB_STATS , 0 .equ CONFIG_EMULATE_HARDWARE , 0 -.equ CONFIG_MISC_FIXES , 0 - .section .note.GNU-stack,"",%progbits +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h index 6b3d8581e2..1a7d21384d 100644 --- a/media/libvpx/config/linux/arm/vpx_config.h +++ b/media/libvpx/config/linux/arm/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 1 -#define ARCH_MIPS 0 -#define ARCH_X86 0 -#define ARCH_X86_64 0 -#define HAVE_NEON 1 +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 1 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h index 760ec9a364..0c84d180ad 100644 --- a/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h +++ b/media/libvpx/config/linux/arm/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,243 +44,234 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d117_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_c +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_c +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16_1)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_c +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_c +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_1)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4 vpx_fdct4x4_c +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4_1 vpx_fdct4x4_1_c +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4_1)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); @@ -274,46 +281,50 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_c(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_c -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -328,6 +339,10 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride) void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); @@ -368,9 +383,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c @@ -430,391 +445,514 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_c +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse8x16 vpx_mse8x16_c +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse8x8 vpx_mse8x8_c +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b vpx_quantize_b_c +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad16x16_avg vpx_sad16x16_avg_c +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x16x3 vpx_sad16x16x3_c - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x16x8 vpx_sad16x16x8_c +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad16x32 vpx_sad16x32_c +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad16x32_avg vpx_sad16x32_avg_c +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad16x32x4d vpx_sad16x32x4d_c +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad16x8_avg vpx_sad16x8_avg_c +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x8x3 vpx_sad16x8x3_c - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad16x8x4d vpx_sad16x8x4d_c - -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad16x8x8 vpx_sad16x8x8_c +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad32x16 vpx_sad32x16_c +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad32x16_avg vpx_sad32x16_avg_c +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad32x16x4d vpx_sad32x16x4d_c +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad32x32_avg vpx_sad32x32_avg_c +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad32x64 vpx_sad32x64_c +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad32x64_avg vpx_sad32x64_avg_c +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad32x64x4d vpx_sad32x64x4d_c +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad4x4_avg vpx_sad4x4_avg_c +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x4x3 vpx_sad4x4x3_c - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad4x4x4d vpx_sad4x4x4d_c - -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x4x8 vpx_sad4x4x8_c +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad4x8 vpx_sad4x8_c +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad4x8_avg vpx_sad4x8_avg_c +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x4d vpx_sad4x8x4d_c - -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad64x32 vpx_sad64x32_c +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad64x32_avg vpx_sad64x32_avg_c +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad64x32x4d vpx_sad64x32x4d_c +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad64x64_avg vpx_sad64x64_avg_c +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad8x16_avg vpx_sad8x16_avg_c +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x16x3 vpx_sad8x16x3_c - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad8x16x4d vpx_sad8x16x4d_c - -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x16x8 vpx_sad8x16x8_c +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -#define vpx_sad8x4 vpx_sad8x4_c +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad8x4_avg vpx_sad8x4_avg_c +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x4d vpx_sad8x4x4d_c - -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -#define vpx_sad8x8_avg vpx_sad8x8_avg_c +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x8x3 vpx_sad8x8x3_c +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -#define vpx_sad8x8x4d vpx_sad8x8x4d_c +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x8x8 vpx_sad8x8x8_c +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_neon(const int16_t *coeff, int length); RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vpx_scaled_2d vpx_scaled_2d_c +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); -#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x32 vpx_variance16x32_c +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance32x16 vpx_variance32x16_c +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance4x4 vpx_variance4x4_c +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance4x8 vpx_variance4x8_c +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x4 vpx_variance8x4_c +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -837,6 +975,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_avg_4x4 = vpx_avg_4x4_neon; vpx_avg_8x8 = vpx_avg_8x8_c; if (flags & HAS_NEON) vpx_avg_8x8 = vpx_avg_8x8_neon; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_NEON) vpx_comp_avg_pred = vpx_comp_avg_pred_neon; vpx_convolve8 = vpx_convolve8_c; if (flags & HAS_NEON) vpx_convolve8 = vpx_convolve8_neon; vpx_convolve8_avg = vpx_convolve8_avg_c; @@ -853,6 +993,14 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon; vpx_convolve_copy = vpx_convolve_copy_c; if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon; + vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d117_predictor_16x16 = vpx_d117_predictor_16x16_neon; + vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d117_predictor_32x32 = vpx_d117_predictor_32x32_neon; + vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d117_predictor_4x4 = vpx_d117_predictor_4x4_neon; + vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d117_predictor_8x8 = vpx_d117_predictor_8x8_neon; vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c; if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon; vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c; @@ -861,6 +1009,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon; vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c; if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon; + vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d153_predictor_16x16 = vpx_d153_predictor_16x16_neon; + vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d153_predictor_32x32 = vpx_d153_predictor_32x32_neon; + vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d153_predictor_4x4 = vpx_d153_predictor_4x4_neon; + vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d153_predictor_8x8 = vpx_d153_predictor_8x8_neon; + vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d207_predictor_16x16 = vpx_d207_predictor_16x16_neon; + vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d207_predictor_32x32 = vpx_d207_predictor_32x32_neon; + vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d207_predictor_4x4 = vpx_d207_predictor_4x4_neon; + vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d207_predictor_8x8 = vpx_d207_predictor_8x8_neon; vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon; vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; @@ -869,6 +1033,14 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon; vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; if (flags & HAS_NEON) vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_neon; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_neon; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_neon; + vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_c; + if (flags & HAS_NEON) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_neon; + vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_neon; vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; if (flags & HAS_NEON) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_neon; vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; @@ -901,6 +1073,20 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_dc_top_predictor_4x4 = vpx_dc_top_predictor_4x4_neon; vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_c; if (flags & HAS_NEON) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_neon; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_NEON) vpx_fdct16x16 = vpx_fdct16x16_neon; + vpx_fdct16x16_1 = vpx_fdct16x16_1_c; + if (flags & HAS_NEON) vpx_fdct16x16_1 = vpx_fdct16x16_1_neon; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_NEON) vpx_fdct32x32 = vpx_fdct32x32_neon; + vpx_fdct32x32_1 = vpx_fdct32x32_1_c; + if (flags & HAS_NEON) vpx_fdct32x32_1 = vpx_fdct32x32_1_neon; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_NEON) vpx_fdct32x32_rd = vpx_fdct32x32_rd_neon; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_NEON) vpx_fdct4x4 = vpx_fdct4x4_neon; + vpx_fdct4x4_1 = vpx_fdct4x4_1_c; + if (flags & HAS_NEON) vpx_fdct4x4_1 = vpx_fdct4x4_1_neon; vpx_fdct8x8 = vpx_fdct8x8_c; if (flags & HAS_NEON) vpx_fdct8x8 = vpx_fdct8x8_neon; vpx_fdct8x8_1 = vpx_fdct8x8_1_c; @@ -921,6 +1107,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_neon; vpx_hadamard_16x16 = vpx_hadamard_16x16_c; if (flags & HAS_NEON) vpx_hadamard_16x16 = vpx_hadamard_16x16_neon; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_NEON) vpx_hadamard_32x32 = vpx_hadamard_32x32_neon; vpx_hadamard_8x8 = vpx_hadamard_8x8_c; if (flags & HAS_NEON) vpx_hadamard_8x8 = vpx_hadamard_8x8_neon; vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; @@ -929,6 +1117,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_idct16x16_1_add = vpx_idct16x16_1_add_neon; vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; if (flags & HAS_NEON) vpx_idct16x16_256_add = vpx_idct16x16_256_add_neon; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_NEON) vpx_idct16x16_38_add = vpx_idct16x16_38_add_neon; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; @@ -979,38 +1169,208 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon; vpx_mse16x16 = vpx_mse16x16_c; if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_NEON) vpx_mse16x8 = vpx_mse16x8_neon; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_NEON) vpx_mse8x16 = vpx_mse8x16_neon; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_NEON) vpx_mse8x8 = vpx_mse8x8_neon; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_NEON) vpx_quantize_b = vpx_quantize_b_neon; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_NEON) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_neon; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_NEON) vpx_sad16x16_avg = vpx_sad16x16_avg_neon; vpx_sad16x16x4d = vpx_sad16x16x4d_c; if (flags & HAS_NEON) vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_NEON) vpx_sad16x32 = vpx_sad16x32_neon; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_NEON) vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_NEON) vpx_sad16x32x4d = vpx_sad16x32x4d_neon; vpx_sad16x8 = vpx_sad16x8_c; if (flags & HAS_NEON) vpx_sad16x8 = vpx_sad16x8_neon; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_NEON) vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_NEON) vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_NEON) vpx_sad32x16 = vpx_sad32x16_neon; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_NEON) vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_NEON) vpx_sad32x16x4d = vpx_sad32x16x4d_neon; vpx_sad32x32 = vpx_sad32x32_c; if (flags & HAS_NEON) vpx_sad32x32 = vpx_sad32x32_neon; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_NEON) vpx_sad32x32_avg = vpx_sad32x32_avg_neon; vpx_sad32x32x4d = vpx_sad32x32x4d_c; if (flags & HAS_NEON) vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_NEON) vpx_sad32x64 = vpx_sad32x64_neon; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_NEON) vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_NEON) vpx_sad32x64x4d = vpx_sad32x64x4d_neon; vpx_sad4x4 = vpx_sad4x4_c; if (flags & HAS_NEON) vpx_sad4x4 = vpx_sad4x4_neon; + vpx_sad4x4_avg = vpx_sad4x4_avg_c; + if (flags & HAS_NEON) vpx_sad4x4_avg = vpx_sad4x4_avg_neon; + vpx_sad4x4x4d = vpx_sad4x4x4d_c; + if (flags & HAS_NEON) vpx_sad4x4x4d = vpx_sad4x4x4d_neon; + vpx_sad4x8 = vpx_sad4x8_c; + if (flags & HAS_NEON) vpx_sad4x8 = vpx_sad4x8_neon; + vpx_sad4x8_avg = vpx_sad4x8_avg_c; + if (flags & HAS_NEON) vpx_sad4x8_avg = vpx_sad4x8_avg_neon; + vpx_sad4x8x4d = vpx_sad4x8x4d_c; + if (flags & HAS_NEON) vpx_sad4x8x4d = vpx_sad4x8x4d_neon; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_NEON) vpx_sad64x32 = vpx_sad64x32_neon; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_NEON) vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_NEON) vpx_sad64x32x4d = vpx_sad64x32x4d_neon; vpx_sad64x64 = vpx_sad64x64_c; if (flags & HAS_NEON) vpx_sad64x64 = vpx_sad64x64_neon; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_NEON) vpx_sad64x64_avg = vpx_sad64x64_avg_neon; vpx_sad64x64x4d = vpx_sad64x64x4d_c; if (flags & HAS_NEON) vpx_sad64x64x4d = vpx_sad64x64x4d_neon; vpx_sad8x16 = vpx_sad8x16_c; if (flags & HAS_NEON) vpx_sad8x16 = vpx_sad8x16_neon; + vpx_sad8x16_avg = vpx_sad8x16_avg_c; + if (flags & HAS_NEON) vpx_sad8x16_avg = vpx_sad8x16_avg_neon; + vpx_sad8x16x4d = vpx_sad8x16x4d_c; + if (flags & HAS_NEON) vpx_sad8x16x4d = vpx_sad8x16x4d_neon; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_NEON) vpx_sad8x4 = vpx_sad8x4_neon; + vpx_sad8x4_avg = vpx_sad8x4_avg_c; + if (flags & HAS_NEON) vpx_sad8x4_avg = vpx_sad8x4_avg_neon; + vpx_sad8x4x4d = vpx_sad8x4x4d_c; + if (flags & HAS_NEON) vpx_sad8x4x4d = vpx_sad8x4x4d_neon; vpx_sad8x8 = vpx_sad8x8_c; if (flags & HAS_NEON) vpx_sad8x8 = vpx_sad8x8_neon; + vpx_sad8x8_avg = vpx_sad8x8_avg_c; + if (flags & HAS_NEON) vpx_sad8x8_avg = vpx_sad8x8_avg_neon; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_NEON) vpx_sad8x8x4d = vpx_sad8x8x4d_neon; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_NEON) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_NEON) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_NEON) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_NEON) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_NEON) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_NEON) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + vpx_sad_skip_4x4 = vpx_sad_skip_4x4_c; + if (flags & HAS_NEON) vpx_sad_skip_4x4 = vpx_sad_skip_4x4_neon; + vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_4x4x4d = vpx_sad_skip_4x4x4d_neon; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_NEON) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_neon; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_neon; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_NEON) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_NEON) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_NEON) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_neon; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_neon; + vpx_sad_skip_8x4 = vpx_sad_skip_8x4_c; + if (flags & HAS_NEON) vpx_sad_skip_8x4 = vpx_sad_skip_8x4_neon; + vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x4x4d = vpx_sad_skip_8x4x4d_neon; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_NEON) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_neon; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_NEON) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_neon; vpx_satd = vpx_satd_c; if (flags & HAS_NEON) vpx_satd = vpx_satd_neon; + vpx_scaled_2d = vpx_scaled_2d_c; + if (flags & HAS_NEON) vpx_scaled_2d = vpx_scaled_2d_neon; + vpx_sse = vpx_sse_c; + if (flags & HAS_NEON) vpx_sse = vpx_sse_neon; + vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_neon; + vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_neon; + vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance16x8 = vpx_sub_pixel_avg_variance16x8_neon; + vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x16 = vpx_sub_pixel_avg_variance32x16_neon; + vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x32 = vpx_sub_pixel_avg_variance32x32_neon; + vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance32x64 = vpx_sub_pixel_avg_variance32x64_neon; + vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x4 = vpx_sub_pixel_avg_variance4x4_neon; + vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance4x8 = vpx_sub_pixel_avg_variance4x8_neon; + vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x32 = vpx_sub_pixel_avg_variance64x32_neon; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_neon; + vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x16 = vpx_sub_pixel_avg_variance8x16_neon; + vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x4 = vpx_sub_pixel_avg_variance8x4_neon; + vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_avg_variance8x8 = vpx_sub_pixel_avg_variance8x8_neon; vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; if (flags & HAS_NEON) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_neon; + vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance16x32 = vpx_sub_pixel_variance16x32_neon; + vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance16x8 = vpx_sub_pixel_variance16x8_neon; + vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance32x16 = vpx_sub_pixel_variance32x16_neon; vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; if (flags & HAS_NEON) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_neon; + vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance32x64 = vpx_sub_pixel_variance32x64_neon; + vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance4x4 = vpx_sub_pixel_variance4x4_neon; + vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance4x8 = vpx_sub_pixel_variance4x8_neon; + vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance64x32 = vpx_sub_pixel_variance64x32_neon; vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_c; if (flags & HAS_NEON) vpx_sub_pixel_variance64x64 = vpx_sub_pixel_variance64x64_neon; + vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance8x16 = vpx_sub_pixel_variance8x16_neon; + vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_c; + if (flags & HAS_NEON) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_neon; vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; if (flags & HAS_NEON) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_neon; vpx_subtract_block = vpx_subtract_block_c; if (flags & HAS_NEON) vpx_subtract_block = vpx_subtract_block_neon; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; + if (flags & HAS_NEON) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; if (flags & HAS_NEON) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_neon; vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; @@ -1029,18 +1389,28 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_v_predictor_8x8 = vpx_v_predictor_8x8_neon; vpx_variance16x16 = vpx_variance16x16_c; if (flags & HAS_NEON) vpx_variance16x16 = vpx_variance16x16_neon; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_NEON) vpx_variance16x32 = vpx_variance16x32_neon; vpx_variance16x8 = vpx_variance16x8_c; if (flags & HAS_NEON) vpx_variance16x8 = vpx_variance16x8_neon; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_NEON) vpx_variance32x16 = vpx_variance32x16_neon; vpx_variance32x32 = vpx_variance32x32_c; if (flags & HAS_NEON) vpx_variance32x32 = vpx_variance32x32_neon; vpx_variance32x64 = vpx_variance32x64_c; if (flags & HAS_NEON) vpx_variance32x64 = vpx_variance32x64_neon; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_NEON) vpx_variance4x4 = vpx_variance4x4_neon; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_NEON) vpx_variance4x8 = vpx_variance4x8_neon; vpx_variance64x32 = vpx_variance64x32_c; if (flags & HAS_NEON) vpx_variance64x32 = vpx_variance64x32_neon; vpx_variance64x64 = vpx_variance64x64_c; if (flags & HAS_NEON) vpx_variance64x64 = vpx_variance64x64_neon; vpx_variance8x16 = vpx_variance8x16_c; if (flags & HAS_NEON) vpx_variance8x16 = vpx_variance8x16_neon; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_NEON) vpx_variance8x4 = vpx_variance8x4_neon; vpx_variance8x8 = vpx_variance8x8_c; if (flags & HAS_NEON) vpx_variance8x8 = vpx_variance8x8_neon; vpx_vector_var = vpx_vector_var_c; @@ -1052,4 +1422,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/arm/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h index a1564b7ad6..ca594ea140 100644 --- a/media/libvpx/config/linux/arm/vpx_scale_rtcd.h +++ b/media/libvpx/config/linux/arm/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -68,4 +82,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/arm64/vp8_rtcd.h b/media/libvpx/config/linux/arm64/vp8_rtcd.h new file mode 100644 index 0000000000..36e6855a6d --- /dev/null +++ b/media/libvpx/config/linux/arm64/vp8_rtcd.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_neon + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_neon + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_neon + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/arm64/vp9_rtcd.h b/media/libvpx/config/linux/arm64/vp9_rtcd.h new file mode 100644 index 0000000000..e1b572fe62 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vp9_rtcd.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_neon + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_neon + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_neon + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_neon + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vp9_block_error = vp9_block_error_neon; + if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve; + vp9_block_error_fp = vp9_block_error_fp_neon; + if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm new file mode 100644 index 0000000000..6758dcb4d3 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 1 +.equ HAVE_NEON_I8MM , 1 +.equ HAVE_SVE , 1 +.equ HAVE_SVE2 , 1 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 1 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm64/vpx_config.c b/media/libvpx/config/linux/arm64/vpx_config.c new file mode 100644 index 0000000000..d9ea4527ca --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-realtime-only"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h new file mode 100644 index 0000000000..b5163a213c --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 1 +#define HAVE_NEON_I8MM 1 +#define HAVE_SVE 1 +#define HAVE_SVE2 1 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..3c34cafc59 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_dsp_rtcd.h @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_neon + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_neon + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_neon + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_neon + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_neon + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_neon + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_neon + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_neon + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_neon + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_neon + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_neon + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_neon + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_neon + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_neon + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_neon + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_neon + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_neon + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_neon + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_neon + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_neon + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_neon + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_neon + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_neon + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_neon + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vpx_convolve8 = vpx_convolve8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm; + vpx_convolve8_avg = vpx_convolve8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm; + vpx_convolve8_horiz = vpx_convolve8_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm; + vpx_convolve8_vert = vpx_convolve8_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm; + vpx_get16x16var = vpx_get16x16var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod; + vpx_get8x8var = vpx_get8x8var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod; + vpx_mse16x16 = vpx_mse16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod; + vpx_mse16x8 = vpx_mse16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod; + vpx_mse8x16 = vpx_mse8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod; + vpx_mse8x8 = vpx_mse8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod; + vpx_sad16x16 = vpx_sad16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod; + vpx_sad16x16_avg = vpx_sad16x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod; + vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod; + vpx_sad16x32 = vpx_sad16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod; + vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod; + vpx_sad16x32x4d = vpx_sad16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod; + vpx_sad16x8 = vpx_sad16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod; + vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod; + vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod; + vpx_sad32x16 = vpx_sad32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod; + vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod; + vpx_sad32x16x4d = vpx_sad32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod; + vpx_sad32x32 = vpx_sad32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod; + vpx_sad32x32_avg = vpx_sad32x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod; + vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod; + vpx_sad32x64 = vpx_sad32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod; + vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod; + vpx_sad32x64x4d = vpx_sad32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod; + vpx_sad64x32 = vpx_sad64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod; + vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod; + vpx_sad64x32x4d = vpx_sad64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod; + vpx_sad64x64 = vpx_sad64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod; + vpx_sad64x64_avg = vpx_sad64x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod; + vpx_sad64x64x4d = vpx_sad64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; + vpx_sse = vpx_sse_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; + if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve; + vpx_variance16x16 = vpx_variance16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; + vpx_variance16x32 = vpx_variance16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod; + vpx_variance16x8 = vpx_variance16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod; + vpx_variance32x16 = vpx_variance32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod; + vpx_variance32x32 = vpx_variance32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod; + vpx_variance32x64 = vpx_variance32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod; + vpx_variance4x4 = vpx_variance4x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod; + vpx_variance4x8 = vpx_variance4x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod; + vpx_variance64x32 = vpx_variance64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod; + vpx_variance64x64 = vpx_variance64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod; + vpx_variance8x16 = vpx_variance8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod; + vpx_variance8x4 = vpx_variance8x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod; + vpx_variance8x8 = vpx_variance8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..ca594ea140 --- /dev/null +++ b/media/libvpx/config/linux/arm64/vpx_scale_rtcd.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/ia32/vp8_rtcd.h b/media/libvpx/config/linux/ia32/vp8_rtcd.h index 5db5bbad85..ca42bb8a5e 100644 --- a/media/libvpx/config/linux/ia32/vp8_rtcd.h +++ b/media/libvpx/config/linux/ia32/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,57 +37,48 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); -RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); @@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -237,9 +234,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; @@ -277,9 +274,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; vp8_loop_filter_bv = vp8_loop_filter_bv_c; @@ -336,4 +330,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/ia32/vp9_rtcd.h b/media/libvpx/config/linux/ia32/vp9_rtcd.h index da53895ca4..28c82a831c 100644 --- a/media/libvpx/config/linux/ia32/vp9_rtcd.h +++ b/media/libvpx/config/linux/ia32/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -113,16 +139,14 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_c; if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_block_error_fp = vp9_block_error_fp_c; if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_c; - if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_fht16x16 = vp9_fht16x16_c; if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; vp9_fht4x4 = vp9_fht4x4_c; @@ -133,9 +157,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; @@ -146,10 +167,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; vp9_quantize_fp = vp9_quantize_fp_c; if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; - vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; - if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -157,4 +190,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm index 2efc8b978b..bafe400119 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.asm +++ b/media/libvpx/config/linux/ia32/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 1 -%define ARCH_X86_64 0 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h index b26f9a51e8..a4cc48afe1 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.h +++ b/media/libvpx/config/linux/ia32/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 1 -#define ARCH_X86_64 0 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h index 112e326a64..0a6266a322 100644 --- a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h +++ b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); @@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); @@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); -RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; vpx_avg_8x8 = vpx_avg_8x8_c; if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_c; if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; @@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void) vpx_convolve8_avg = vpx_convolve8_avg_c; if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; @@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; vpx_fdct16x16 = vpx_fdct16x16_c; if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct16x16_1 = vpx_fdct16x16_1_c; if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; vpx_fdct32x32 = vpx_fdct32x32_c; @@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; vpx_hadamard_16x16 = vpx_hadamard_16x16_c; if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_c; if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; @@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; - if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; @@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_mse16x8 = vpx_mse16x8_c; if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_mse8x16 = vpx_mse8x16_c; if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; vpx_mse8x8 = vpx_mse8x8_c; @@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; vpx_quantize_b = vpx_quantize_b_c; if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; vpx_sad16x16_avg = vpx_sad16x16_avg_c; if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; vpx_sad16x16x4d = vpx_sad16x16x4d_c; if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; vpx_sad16x32 = vpx_sad16x32_c; if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; vpx_sad16x32_avg = vpx_sad16x32_avg_c; @@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; vpx_sad16x8_avg = vpx_sad16x8_avg_c; if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; vpx_sad16x8x4d = vpx_sad16x8x4d_c; if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; vpx_sad32x16 = vpx_sad32x16_c; if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; @@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; vpx_sad4x4_avg = vpx_sad4x4_avg_c; if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; vpx_sad4x4x4d = vpx_sad4x4x4d_c; if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad4x8 = vpx_sad4x8_c; if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; vpx_sad4x8_avg = vpx_sad4x8_avg_c; @@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void) vpx_sad64x32 = vpx_sad64x32_c; if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_c; if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x32x4d = vpx_sad64x32x4d_c; if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; vpx_sad64x64 = vpx_sad64x64_c; if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_c; if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_c; if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; vpx_sad8x16 = vpx_sad8x16_c; if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; vpx_sad8x16_avg = vpx_sad8x16_avg_c; if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; vpx_sad8x16x4d = vpx_sad8x16x4d_c; if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; vpx_sad8x4 = vpx_sad8x4_c; if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; vpx_sad8x4_avg = vpx_sad8x4_avg_c; @@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; vpx_sad8x8_avg = vpx_sad8x8_avg_c; if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x4d = vpx_sad8x8x4d_c; if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; vpx_satd = vpx_satd_c; if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; @@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; vpx_subtract_block = vpx_subtract_block_c; if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; @@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; vpx_variance16x32 = vpx_variance16x32_c; if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; vpx_variance16x8 = vpx_variance16x8_c; if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_c; if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; @@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; vpx_variance32x64 = vpx_variance32x64_c; if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance4x4 = vpx_variance4x4_c; if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; vpx_variance4x8 = vpx_variance4x8_c; @@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; vpx_variance8x16 = vpx_variance8x16_c; if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; vpx_variance8x4 = vpx_variance8x4_c; if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; vpx_variance8x8 = vpx_variance8x8_c; if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; vpx_vector_var = vpx_vector_var_c; if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; } @@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h +++ b/media/libvpx/config/linux/ia32/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/loongarch64/vp8_rtcd.h b/media/libvpx/config/linux/loongarch64/vp8_rtcd.h new file mode 100644 index 0000000000..460f493156 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vp8_rtcd.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_lsx(short *coeff, short *dqcoeff); +RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_lsx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_c + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_lsx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_lsx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_c + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_lsx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_lsx(struct macroblock *mb, int dc); +RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc); + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_lsx(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_lsx(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_lsx(short *input, short *output, int pitch); +RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_lsx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/loongarch.h" +static void setup_rtcd_internal(void) +{ + int flags = loongarch_cpu_caps(); + + (void)flags; + vp8_block_error = vp8_block_error_c; + if (flags & HAS_LSX) vp8_block_error = vp8_block_error_lsx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_LSX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_lsx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_LSX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_lsx; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_LSX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_lsx; + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_LSX) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_LSX) vp8_loop_filter_bh = vp8_loop_filter_bh_lsx; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_LSX) vp8_loop_filter_bv = vp8_loop_filter_bv_lsx; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_LSX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_lsx; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_LSX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_lsx; + vp8_mbblock_error = vp8_mbblock_error_c; + if (flags & HAS_LSX) vp8_mbblock_error = vp8_mbblock_error_lsx; + vp8_regular_quantize_b = vp8_regular_quantize_b_c; + if (flags & HAS_LSX) vp8_regular_quantize_b = vp8_regular_quantize_b_lsx; + vp8_short_fdct4x4 = vp8_short_fdct4x4_c; + if (flags & HAS_LSX) vp8_short_fdct4x4 = vp8_short_fdct4x4_lsx; + vp8_short_fdct8x4 = vp8_short_fdct8x4_c; + if (flags & HAS_LSX) vp8_short_fdct8x4 = vp8_short_fdct8x4_lsx; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; + if (flags & HAS_LSX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_lsx; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; + if (flags & HAS_LSX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_lsx; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; + if (flags & HAS_LSX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_lsx; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/loongarch64/vp9_rtcd.h b/media/libvpx/config/linux/loongarch64/vp9_rtcd.h new file mode 100644 index 0000000000..80428e6571 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vp9_rtcd.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_c + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/loongarch.h" +static void setup_rtcd_internal(void) +{ + int flags = loongarch_cpu_caps(); + + (void)flags; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.asm b/media/libvpx/config/linux/loongarch64/vpx_config.asm new file mode 100644 index 0000000000..d23406cac5 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 1 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 1 +.equ HAVE_LASX , 1 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 0 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.c b/media/libvpx/config/linux/loongarch64/vpx_config.c new file mode 100644 index 0000000000..8ee96400cc --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=loongarch64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/loongarch64/vpx_config.h b/media/libvpx/config/linux/loongarch64/vpx_config.h new file mode 100644 index 0000000000..e104a8b366 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 1 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 1 +#define HAVE_LASX 1 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..06b98b8b24 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vpx_dsp_rtcd.h @@ -0,0 +1,929 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_c + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_c + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_lsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_lsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_lsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_lsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct4x4)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_lsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct8x8)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_lsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_lsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_lsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_lsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_lsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_lsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_c + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_lsx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vpx_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_lsx(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_lsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_c + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_c + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_c + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_lsx(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_lsx(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_lsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_c + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/loongarch.h" +static void setup_rtcd_internal(void) +{ + int flags = loongarch_cpu_caps(); + + (void)flags; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_LSX) vpx_comp_avg_pred = vpx_comp_avg_pred_lsx; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_LSX) vpx_convolve8 = vpx_convolve8_lsx; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_LSX) vpx_convolve8_avg = vpx_convolve8_avg_lsx; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_LSX) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_lsx; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_LSX) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_lsx; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_LSX) vpx_convolve8_horiz = vpx_convolve8_horiz_lsx; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_LSX) vpx_convolve8_vert = vpx_convolve8_vert_lsx; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_LSX) vpx_convolve_avg = vpx_convolve_avg_lsx; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_LSX) vpx_convolve_copy = vpx_convolve_copy_lsx; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_LSX) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_lsx; + vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_c; + if (flags & HAS_LSX) vpx_dc_predictor_8x8 = vpx_dc_predictor_8x8_lsx; + vpx_fdct16x16 = vpx_fdct16x16_c; + if (flags & HAS_LSX) vpx_fdct16x16 = vpx_fdct16x16_lsx; + vpx_fdct32x32 = vpx_fdct32x32_c; + if (flags & HAS_LSX) vpx_fdct32x32 = vpx_fdct32x32_lsx; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_LSX) vpx_fdct32x32_rd = vpx_fdct32x32_rd_lsx; + vpx_fdct4x4 = vpx_fdct4x4_c; + if (flags & HAS_LSX) vpx_fdct4x4 = vpx_fdct4x4_lsx; + vpx_fdct8x8 = vpx_fdct8x8_c; + if (flags & HAS_LSX) vpx_fdct8x8 = vpx_fdct8x8_lsx; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_LSX) vpx_get16x16var = vpx_get16x16var_lsx; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_LSX) vpx_hadamard_16x16 = vpx_hadamard_16x16_lsx; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_LSX) vpx_hadamard_8x8 = vpx_hadamard_8x8_lsx; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_LSX) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_lsx; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; + if (flags & HAS_LSX) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_lsx; + vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; + if (flags & HAS_LSX) vpx_idct32x32_1_add = vpx_idct32x32_1_add_lsx; + vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; + if (flags & HAS_LSX) vpx_idct32x32_34_add = vpx_idct32x32_34_add_lsx; + vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_c; + if (flags & HAS_LSX) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_lsx; + vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_c; + if (flags & HAS_LSX) vpx_lpf_horizontal_4 = vpx_lpf_horizontal_4_lsx; + vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_c; + if (flags & HAS_LSX) vpx_lpf_horizontal_4_dual = vpx_lpf_horizontal_4_dual_lsx; + vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_c; + if (flags & HAS_LSX) vpx_lpf_horizontal_8 = vpx_lpf_horizontal_8_lsx; + vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_c; + if (flags & HAS_LSX) vpx_lpf_horizontal_8_dual = vpx_lpf_horizontal_8_dual_lsx; + vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_c; + if (flags & HAS_LSX) vpx_lpf_vertical_16_dual = vpx_lpf_vertical_16_dual_lsx; + vpx_lpf_vertical_4 = vpx_lpf_vertical_4_c; + if (flags & HAS_LSX) vpx_lpf_vertical_4 = vpx_lpf_vertical_4_lsx; + vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_c; + if (flags & HAS_LSX) vpx_lpf_vertical_4_dual = vpx_lpf_vertical_4_dual_lsx; + vpx_lpf_vertical_8 = vpx_lpf_vertical_8_c; + if (flags & HAS_LSX) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_lsx; + vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; + if (flags & HAS_LSX) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_lsx; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_LSX) vpx_mse16x16 = vpx_mse16x16_lsx; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_LSX) vpx_quantize_b = vpx_quantize_b_lsx; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_LSX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_lsx; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_LSX) vpx_sad16x16 = vpx_sad16x16_lsx; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_LSX) vpx_sad16x16x4d = vpx_sad16x16x4d_lsx; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_LSX) vpx_sad32x32 = vpx_sad32x32_lsx; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_LSX) vpx_sad32x32_avg = vpx_sad32x32_avg_lsx; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_LSX) vpx_sad32x32x4d = vpx_sad32x32x4d_lsx; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_LSX) vpx_sad32x64x4d = vpx_sad32x64x4d_lsx; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_LSX) vpx_sad64x32x4d = vpx_sad64x32x4d_lsx; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_LSX) vpx_sad64x64 = vpx_sad64x64_lsx; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_LSX) vpx_sad64x64_avg = vpx_sad64x64_avg_lsx; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_LSX) vpx_sad64x64x4d = vpx_sad64x64x4d_lsx; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_LSX) vpx_sad8x8 = vpx_sad8x8_lsx; + vpx_sad8x8x4d = vpx_sad8x8x4d_c; + if (flags & HAS_LSX) vpx_sad8x8x4d = vpx_sad8x8x4d_lsx; + vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_c; + if (flags & HAS_LSX) vpx_sub_pixel_avg_variance64x64 = vpx_sub_pixel_avg_variance64x64_lsx; + vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_c; + if (flags & HAS_LSX) vpx_sub_pixel_variance16x16 = vpx_sub_pixel_variance16x16_lsx; + vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_c; + if (flags & HAS_LSX) vpx_sub_pixel_variance32x32 = vpx_sub_pixel_variance32x32_lsx; + vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_c; + if (flags & HAS_LSX) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_lsx; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_LSX) vpx_subtract_block = vpx_subtract_block_lsx; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_LSX) vpx_variance16x16 = vpx_variance16x16_lsx; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_LSX) vpx_variance32x32 = vpx_variance32x32_lsx; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_LSX) vpx_variance64x64 = vpx_variance64x64_lsx; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_LSX) vpx_variance8x8 = vpx_variance8x8_lsx; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h b/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..9fb8e25d07 --- /dev/null +++ b/media/libvpx/config/linux/loongarch64/vpx_scale_rtcd.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/loongarch.h" +static void setup_rtcd_internal(void) +{ + int flags = loongarch_cpu_caps(); + + (void)flags; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/mips32/vp8_rtcd.h b/media/libvpx/config/linux/mips32/vp8_rtcd.h new file mode 100644 index 0000000000..32c48c547b --- /dev/null +++ b/media/libvpx/config/linux/mips32/vp8_rtcd.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_c + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_c + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_c + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_c + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_c + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_c + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/mips32/vp9_rtcd.h b/media/libvpx/config/linux/mips32/vp9_rtcd.h new file mode 100644 index 0000000000..876acc10ef --- /dev/null +++ b/media/libvpx/config/linux/mips32/vp9_rtcd.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_c + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_c + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/mips32/vpx_config.asm b/media/libvpx/config/linux/mips32/vpx_config.asm new file mode 100644 index 0000000000..799a79a073 --- /dev/null +++ b/media/libvpx/config/linux/mips32/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 1 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 +.equ HAVE_MIPS32 , 1 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 1 +.equ CONFIG_DC_RECON , 1 +.equ CONFIG_RUNTIME_CPU_DETECT , 0 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 0 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/mips32/vpx_config.c b/media/libvpx/config/linux/mips32/vpx_config.c new file mode 100644 index 0000000000..c05113f670 --- /dev/null +++ b/media/libvpx/config/linux/mips32/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=mips32-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/mips32/vpx_config.h b/media/libvpx/config/linux/mips32/vpx_config.h new file mode 100644 index 0000000000..863749cd8c --- /dev/null +++ b/media/libvpx/config/linux/mips32/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 1 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 +#define HAVE_MIPS32 1 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 1 +#define CONFIG_DC_RECON 1 +#define CONFIG_RUNTIME_CPU_DETECT 0 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..291bf5d03b --- /dev/null +++ b/media/libvpx/config/linux/mips32/vpx_dsp_rtcd.h @@ -0,0 +1,1524 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_c + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_c + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_c + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_c + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_c + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_c + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_c + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_c + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_c + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_c + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_c + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_c + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_c + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_c + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_c + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_c + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_c + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_c + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_c + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_c + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x32x4d vpx_sad32x32x4d_c + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_c + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_c + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_c + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_c + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_c + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x64x4d vpx_sad64x64x4d_c + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_c + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_c + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_c + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_c + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_c + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_c + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_c + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_c + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_c + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_c + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_c + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_c + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_c + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_c + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_c + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_c + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_c + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_c + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_c + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_c + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_c + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_c + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_c + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_c + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_c + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_c + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_c + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_c + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_c + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_c + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_c + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_c + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_c + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_c + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_c + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_c + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_c + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_c + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x32x4d vpx_sad32x32x4d_c + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_c + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_c + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_c + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_c + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_c + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x64x4d vpx_sad64x64x4d_c + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_c + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_c + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_c + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_c + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_c + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_c + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_c + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_c + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_c + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_c + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_c + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_c + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_c + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_c + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_c + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_c + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h b/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h new file mode 100644 index 0000000000..982dbe30d9 --- /dev/null +++ b/media/libvpx/config/linux/mips32/vpx_scale_rtcd.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +static void setup_rtcd_internal(void) +{ +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/mips64/vp8_rtcd.h b/media/libvpx/config/linux/mips64/vp8_rtcd.h new file mode 100644 index 0000000000..426c1a6d37 --- /dev/null +++ b/media/libvpx/config/linux/mips64/vp8_rtcd.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_msa + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_msa + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_msa + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_msa + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_msa(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_msa + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_msa + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_msa + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_msa(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_msa + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmi(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_msa(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_msa + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_msa(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_msa + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_msa(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_msa + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmi(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_msa(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_msa + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_mmi(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_msa(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_msa + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_mmi(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_msa(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_msa + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmi(struct blockd*, short *DQC); +void vp8_dequantize_b_msa(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_msa + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sadx4 + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_mmi(struct block *, struct blockd *); +void vp8_fast_quantize_b_msa(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_msa + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_msa + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_msa + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_msa + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_msa(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_msa + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_msa + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_msa + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_msa + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_msa(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_msa + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_msa(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_msa + +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_msa(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_msa + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_mmi(struct block *, struct blockd *); +void vp8_regular_quantize_b_msa(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_msa + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_mmi(short *input, short *output, int pitch); +void vp8_short_fdct4x4_msa(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_msa + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_mmi(short *input, short *output, int pitch); +void vp8_short_fdct8x4_msa(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_msa + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmi(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_msa(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_msa + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_mmi(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_msa(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_msa + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_mmi(short *input, short *output, int pitch); +void vp8_short_walsh4x4_msa(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_msa + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_msa + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_msa + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_msa + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_mmi(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_msa(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_msa + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +void vp8_temporal_filter_apply_msa(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_msa + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/mips64/vp9_rtcd.h b/media/libvpx/config/linux/mips64/vp9_rtcd.h new file mode 100644 index 0000000000..4ab5c98198 --- /dev/null +++ b/media/libvpx/config/linux/mips64/vp9_rtcd.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_msa + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_msa + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_msa + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_msa + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_msa + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_c + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/mips64/vpx_config.asm b/media/libvpx/config/linux/mips64/vpx_config.asm new file mode 100644 index 0000000000..0a8b7a6c85 --- /dev/null +++ b/media/libvpx/config/linux/mips64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 1 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 1 +.equ HAVE_MIPS64 , 1 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 1 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 1 +.equ CONFIG_DC_RECON , 1 +.equ CONFIG_RUNTIME_CPU_DETECT , 0 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 0 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 0 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/mips64/vpx_config.c b/media/libvpx/config/linux/mips64/vpx_config.c new file mode 100644 index 0000000000..29d6e5c004 --- /dev/null +++ b/media/libvpx/config/linux/mips64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=mips64-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --cpu=loongson3"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/mips64/vpx_config.h b/media/libvpx/config/linux/mips64/vpx_config.h new file mode 100644 index 0000000000..ceed69be84 --- /dev/null +++ b/media/libvpx/config/linux/mips64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 1 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 1 +#define HAVE_MIPS64 1 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 1 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 1 +#define CONFIG_DC_RECON 1 +#define CONFIG_RUNTIME_CPU_DETECT 0 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 0 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..bf43d539bd --- /dev/null +++ b/media/libvpx/config/linux/mips64/vpx_dsp_rtcd.h @@ -0,0 +1,1029 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_msa(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_msa + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_msa(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_msa + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_msa + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_msa + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_msa + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_msa + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_msa + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_msa + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_c + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_msa + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_msa + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_msa + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_msa + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_msa + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_msa + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_msa + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_msa + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_msa + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_msa + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_msa + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_msa + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_msa + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_msa + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_msa + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_msa + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_msa + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_msa + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_msa + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_msa + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_msa + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_msa(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_msa + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_msa + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_msa(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_msa + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_msa + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_msa + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_msa + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_msa(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_msa(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_msa + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_msa + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_msa + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_msa + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_msa + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_msa + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_msa + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_msa + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_msa + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_msa + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_msa + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_msa + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_msa + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_msa(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_msa + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_msa + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_msa + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_msa + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_msa + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_msa + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_msa + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_msa + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x16x4d vpx_sad16x16x4d_msa + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_msa + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_msa + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x32x4d vpx_sad16x32x4d_msa + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_msa + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_msa + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad16x8x4d vpx_sad16x8x4d_msa + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_msa + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_msa + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x16x4d vpx_sad32x16x4d_msa + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_msa + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_msa + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x32x4d vpx_sad32x32x4d_msa + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_msa + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_msa + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad32x64x4d vpx_sad32x64x4d_msa + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_msa + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_msa + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_msa + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_msa + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_msa + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_msa + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_msa + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_msa + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x32x4d vpx_sad64x32x4d_msa + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_msa + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_msa + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad64x64x4d vpx_sad64x64x4d_msa + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_msa + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_msa + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_msa + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_msa + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_msa + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_msa + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_msa + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_msa + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_msa + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_msa(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_msa + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_msa + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_msa + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_msa + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_msa + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_msa + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_msa + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_msa + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_msa + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_msa + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_msa + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_msa + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_msa + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_msa + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_msa + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_msa + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_msa + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_msa + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_msa + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_msa + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_msa + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_msa + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_msa + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_msa + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_msa + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_msa + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_msa + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_msa(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_msa + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_msa(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_msa + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_msa + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_msa + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_msa + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_msa + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_msa + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_msa + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_msa + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_msa + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_msa + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_msa + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_msa + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_msa + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_msa + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_msa + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_msa + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_msa + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_msa + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_msa + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_msa + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_msa + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_msa + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_mmi(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_msa + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_msa + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h b/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..c0d59a108d --- /dev/null +++ b/media/libvpx/config/linux/mips64/vpx_scale_rtcd.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/mips.h" +static void setup_rtcd_internal(void) +{ + int flags = mips_cpu_caps(); + + (void)flags; + +#if HAVE_DSPR2 +void vpx_dsputil_static_init(); +#if CONFIG_VP8 +void dsputil_static_init(); +#endif + +vpx_dsputil_static_init(); +#if CONFIG_VP8 +dsputil_static_init(); +#endif +#endif +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/ppc64le/vp8_rtcd.h b/media/libvpx/config/linux/ppc64le/vp8_rtcd.h new file mode 100644 index 0000000000..1b7314e6bd --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vp8_rtcd.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_c + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_c + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_c + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_c + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_c + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_c + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_c + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_c + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_c + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_c + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_c + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); +#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/ppc.h" +static void setup_rtcd_internal(void) +{ + int flags = ppc_simd_caps(); + (void)flags; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/ppc64le/vp9_rtcd.h b/media/libvpx/config/linux/ppc64le/vp9_rtcd.h new file mode 100644 index 0000000000..1d4de12268 --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vp9_rtcd.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +#define vp9_apply_temporal_filter vp9_apply_temporal_filter_c + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12 vpx_convolve12_c + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_horiz vpx_convolve12_horiz_c + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve12_vert vpx_convolve12_vert_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/ppc.h" +static void setup_rtcd_internal(void) +{ + int flags = ppc_simd_caps(); + (void)flags; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_VSX) vp9_iht16x16_256_add = vp9_iht16x16_256_add_vsx; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_VSX) vp9_iht4x4_16_add = vp9_iht4x4_16_add_vsx; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_VSX) vp9_iht8x8_64_add = vp9_iht8x8_64_add_vsx; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_VSX) vp9_quantize_fp = vp9_quantize_fp_vsx; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_VSX) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_vsx; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.asm b/media/libvpx/config/linux/ppc64le/vpx_config.asm new file mode 100644 index 0000000000..543a627257 --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 1 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 +.equ HAVE_SVE2 , 0 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 1 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 0 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.c b/media/libvpx/config/linux/ppc64le/vpx_config.c new file mode 100644 index 0000000000..edd90f65bc --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=ppc64le-linux-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-vsx"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/linux/ppc64le/vpx_config.h b/media/libvpx/config/linux/ppc64le/vpx_config.h new file mode 100644 index 0000000000..883e4d686e --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 1 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 1 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 0 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..596c863f88 --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vpx_dsp_rtcd.h @@ -0,0 +1,1015 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_c + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_c + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_c + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_c + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_c + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_c + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_c + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_c + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_c + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_c + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_c + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_c + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_c + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct32x32_rd)(const int16_t *input, tran_low_t *output, int stride); + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_c + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_c + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_c + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_vsx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +unsigned int vpx_get_mb_ss_vsx(const int16_t *); +RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_c + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_c + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_c + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_c + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_c + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_c + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_c + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_c + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_c + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_c + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_c + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_c + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_c + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_c + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_c + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_c + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_c + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + +int vpx_satd_c(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_c + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +#define vpx_sse vpx_sse_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_c + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_c + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_c + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_c + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_c + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_c + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_c + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_c + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_vsx(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_c + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/ppc.h" +static void setup_rtcd_internal(void) +{ + int flags = ppc_simd_caps(); + (void)flags; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_VSX) vpx_comp_avg_pred = vpx_comp_avg_pred_vsx; + vpx_convolve8 = vpx_convolve8_c; + if (flags & HAS_VSX) vpx_convolve8 = vpx_convolve8_vsx; + vpx_convolve8_avg = vpx_convolve8_avg_c; + if (flags & HAS_VSX) vpx_convolve8_avg = vpx_convolve8_avg_vsx; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; + if (flags & HAS_VSX) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_vsx; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; + if (flags & HAS_VSX) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_vsx; + vpx_convolve8_horiz = vpx_convolve8_horiz_c; + if (flags & HAS_VSX) vpx_convolve8_horiz = vpx_convolve8_horiz_vsx; + vpx_convolve8_vert = vpx_convolve8_vert_c; + if (flags & HAS_VSX) vpx_convolve8_vert = vpx_convolve8_vert_vsx; + vpx_convolve_avg = vpx_convolve_avg_c; + if (flags & HAS_VSX) vpx_convolve_avg = vpx_convolve_avg_vsx; + vpx_convolve_copy = vpx_convolve_copy_c; + if (flags & HAS_VSX) vpx_convolve_copy = vpx_convolve_copy_vsx; + vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; + if (flags & HAS_VSX) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_vsx; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_VSX) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_vsx; + vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_c; + if (flags & HAS_VSX) vpx_d63_predictor_16x16 = vpx_d63_predictor_16x16_vsx; + vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_c; + if (flags & HAS_VSX) vpx_d63_predictor_32x32 = vpx_d63_predictor_32x32_vsx; + vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_c; + if (flags & HAS_VSX) vpx_dc_128_predictor_16x16 = vpx_dc_128_predictor_16x16_vsx; + vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_c; + if (flags & HAS_VSX) vpx_dc_128_predictor_32x32 = vpx_dc_128_predictor_32x32_vsx; + vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_c; + if (flags & HAS_VSX) vpx_dc_left_predictor_16x16 = vpx_dc_left_predictor_16x16_vsx; + vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_c; + if (flags & HAS_VSX) vpx_dc_left_predictor_32x32 = vpx_dc_left_predictor_32x32_vsx; + vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_c; + if (flags & HAS_VSX) vpx_dc_predictor_16x16 = vpx_dc_predictor_16x16_vsx; + vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_c; + if (flags & HAS_VSX) vpx_dc_predictor_32x32 = vpx_dc_predictor_32x32_vsx; + vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_c; + if (flags & HAS_VSX) vpx_dc_top_predictor_16x16 = vpx_dc_top_predictor_16x16_vsx; + vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_c; + if (flags & HAS_VSX) vpx_dc_top_predictor_32x32 = vpx_dc_top_predictor_32x32_vsx; + vpx_fdct32x32_rd = vpx_fdct32x32_rd_c; + if (flags & HAS_VSX) vpx_fdct32x32_rd = vpx_fdct32x32_rd_vsx; + vpx_get16x16var = vpx_get16x16var_c; + if (flags & HAS_VSX) vpx_get16x16var = vpx_get16x16var_vsx; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_c; + if (flags & HAS_VSX) vpx_get4x4sse_cs = vpx_get4x4sse_cs_vsx; + vpx_get8x8var = vpx_get8x8var_c; + if (flags & HAS_VSX) vpx_get8x8var = vpx_get8x8var_vsx; + vpx_get_mb_ss = vpx_get_mb_ss_c; + if (flags & HAS_VSX) vpx_get_mb_ss = vpx_get_mb_ss_vsx; + vpx_h_predictor_16x16 = vpx_h_predictor_16x16_c; + if (flags & HAS_VSX) vpx_h_predictor_16x16 = vpx_h_predictor_16x16_vsx; + vpx_h_predictor_32x32 = vpx_h_predictor_32x32_c; + if (flags & HAS_VSX) vpx_h_predictor_32x32 = vpx_h_predictor_32x32_vsx; + vpx_hadamard_16x16 = vpx_hadamard_16x16_c; + if (flags & HAS_VSX) vpx_hadamard_16x16 = vpx_hadamard_16x16_vsx; + vpx_hadamard_8x8 = vpx_hadamard_8x8_c; + if (flags & HAS_VSX) vpx_hadamard_8x8 = vpx_hadamard_8x8_vsx; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; + if (flags & HAS_VSX) vpx_idct16x16_256_add = vpx_idct16x16_256_add_vsx; + vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; + if (flags & HAS_VSX) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_vsx; + vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; + if (flags & HAS_VSX) vpx_idct4x4_16_add = vpx_idct4x4_16_add_vsx; + vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; + if (flags & HAS_VSX) vpx_idct8x8_64_add = vpx_idct8x8_64_add_vsx; + vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_c; + if (flags & HAS_VSX) vpx_iwht4x4_16_add = vpx_iwht4x4_16_add_vsx; + vpx_mse16x16 = vpx_mse16x16_c; + if (flags & HAS_VSX) vpx_mse16x16 = vpx_mse16x16_vsx; + vpx_mse16x8 = vpx_mse16x8_c; + if (flags & HAS_VSX) vpx_mse16x8 = vpx_mse16x8_vsx; + vpx_mse8x16 = vpx_mse8x16_c; + if (flags & HAS_VSX) vpx_mse8x16 = vpx_mse8x16_vsx; + vpx_mse8x8 = vpx_mse8x8_c; + if (flags & HAS_VSX) vpx_mse8x8 = vpx_mse8x8_vsx; + vpx_quantize_b = vpx_quantize_b_c; + if (flags & HAS_VSX) vpx_quantize_b = vpx_quantize_b_vsx; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_VSX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_vsx; + vpx_sad16x16 = vpx_sad16x16_c; + if (flags & HAS_VSX) vpx_sad16x16 = vpx_sad16x16_vsx; + vpx_sad16x16_avg = vpx_sad16x16_avg_c; + if (flags & HAS_VSX) vpx_sad16x16_avg = vpx_sad16x16_avg_vsx; + vpx_sad16x16x4d = vpx_sad16x16x4d_c; + if (flags & HAS_VSX) vpx_sad16x16x4d = vpx_sad16x16x4d_vsx; + vpx_sad16x32 = vpx_sad16x32_c; + if (flags & HAS_VSX) vpx_sad16x32 = vpx_sad16x32_vsx; + vpx_sad16x32_avg = vpx_sad16x32_avg_c; + if (flags & HAS_VSX) vpx_sad16x32_avg = vpx_sad16x32_avg_vsx; + vpx_sad16x32x4d = vpx_sad16x32x4d_c; + if (flags & HAS_VSX) vpx_sad16x32x4d = vpx_sad16x32x4d_vsx; + vpx_sad16x8 = vpx_sad16x8_c; + if (flags & HAS_VSX) vpx_sad16x8 = vpx_sad16x8_vsx; + vpx_sad16x8_avg = vpx_sad16x8_avg_c; + if (flags & HAS_VSX) vpx_sad16x8_avg = vpx_sad16x8_avg_vsx; + vpx_sad16x8x4d = vpx_sad16x8x4d_c; + if (flags & HAS_VSX) vpx_sad16x8x4d = vpx_sad16x8x4d_vsx; + vpx_sad32x16 = vpx_sad32x16_c; + if (flags & HAS_VSX) vpx_sad32x16 = vpx_sad32x16_vsx; + vpx_sad32x16_avg = vpx_sad32x16_avg_c; + if (flags & HAS_VSX) vpx_sad32x16_avg = vpx_sad32x16_avg_vsx; + vpx_sad32x16x4d = vpx_sad32x16x4d_c; + if (flags & HAS_VSX) vpx_sad32x16x4d = vpx_sad32x16x4d_vsx; + vpx_sad32x32 = vpx_sad32x32_c; + if (flags & HAS_VSX) vpx_sad32x32 = vpx_sad32x32_vsx; + vpx_sad32x32_avg = vpx_sad32x32_avg_c; + if (flags & HAS_VSX) vpx_sad32x32_avg = vpx_sad32x32_avg_vsx; + vpx_sad32x32x4d = vpx_sad32x32x4d_c; + if (flags & HAS_VSX) vpx_sad32x32x4d = vpx_sad32x32x4d_vsx; + vpx_sad32x64 = vpx_sad32x64_c; + if (flags & HAS_VSX) vpx_sad32x64 = vpx_sad32x64_vsx; + vpx_sad32x64_avg = vpx_sad32x64_avg_c; + if (flags & HAS_VSX) vpx_sad32x64_avg = vpx_sad32x64_avg_vsx; + vpx_sad32x64x4d = vpx_sad32x64x4d_c; + if (flags & HAS_VSX) vpx_sad32x64x4d = vpx_sad32x64x4d_vsx; + vpx_sad64x32 = vpx_sad64x32_c; + if (flags & HAS_VSX) vpx_sad64x32 = vpx_sad64x32_vsx; + vpx_sad64x32_avg = vpx_sad64x32_avg_c; + if (flags & HAS_VSX) vpx_sad64x32_avg = vpx_sad64x32_avg_vsx; + vpx_sad64x32x4d = vpx_sad64x32x4d_c; + if (flags & HAS_VSX) vpx_sad64x32x4d = vpx_sad64x32x4d_vsx; + vpx_sad64x64 = vpx_sad64x64_c; + if (flags & HAS_VSX) vpx_sad64x64 = vpx_sad64x64_vsx; + vpx_sad64x64_avg = vpx_sad64x64_avg_c; + if (flags & HAS_VSX) vpx_sad64x64_avg = vpx_sad64x64_avg_vsx; + vpx_sad64x64x4d = vpx_sad64x64x4d_c; + if (flags & HAS_VSX) vpx_sad64x64x4d = vpx_sad64x64x4d_vsx; + vpx_sad8x16 = vpx_sad8x16_c; + if (flags & HAS_VSX) vpx_sad8x16 = vpx_sad8x16_vsx; + vpx_sad8x4 = vpx_sad8x4_c; + if (flags & HAS_VSX) vpx_sad8x4 = vpx_sad8x4_vsx; + vpx_sad8x8 = vpx_sad8x8_c; + if (flags & HAS_VSX) vpx_sad8x8 = vpx_sad8x8_vsx; + vpx_subtract_block = vpx_subtract_block_c; + if (flags & HAS_VSX) vpx_subtract_block = vpx_subtract_block_vsx; + vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; + if (flags & HAS_VSX) vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_vsx; + vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_c; + if (flags & HAS_VSX) vpx_tm_predictor_32x32 = vpx_tm_predictor_32x32_vsx; + vpx_v_predictor_16x16 = vpx_v_predictor_16x16_c; + if (flags & HAS_VSX) vpx_v_predictor_16x16 = vpx_v_predictor_16x16_vsx; + vpx_v_predictor_32x32 = vpx_v_predictor_32x32_c; + if (flags & HAS_VSX) vpx_v_predictor_32x32 = vpx_v_predictor_32x32_vsx; + vpx_variance16x16 = vpx_variance16x16_c; + if (flags & HAS_VSX) vpx_variance16x16 = vpx_variance16x16_vsx; + vpx_variance16x32 = vpx_variance16x32_c; + if (flags & HAS_VSX) vpx_variance16x32 = vpx_variance16x32_vsx; + vpx_variance16x8 = vpx_variance16x8_c; + if (flags & HAS_VSX) vpx_variance16x8 = vpx_variance16x8_vsx; + vpx_variance32x16 = vpx_variance32x16_c; + if (flags & HAS_VSX) vpx_variance32x16 = vpx_variance32x16_vsx; + vpx_variance32x32 = vpx_variance32x32_c; + if (flags & HAS_VSX) vpx_variance32x32 = vpx_variance32x32_vsx; + vpx_variance32x64 = vpx_variance32x64_c; + if (flags & HAS_VSX) vpx_variance32x64 = vpx_variance32x64_vsx; + vpx_variance4x4 = vpx_variance4x4_c; + if (flags & HAS_VSX) vpx_variance4x4 = vpx_variance4x4_vsx; + vpx_variance4x8 = vpx_variance4x8_c; + if (flags & HAS_VSX) vpx_variance4x8 = vpx_variance4x8_vsx; + vpx_variance64x32 = vpx_variance64x32_c; + if (flags & HAS_VSX) vpx_variance64x32 = vpx_variance64x32_vsx; + vpx_variance64x64 = vpx_variance64x64_c; + if (flags & HAS_VSX) vpx_variance64x64 = vpx_variance64x64_vsx; + vpx_variance8x16 = vpx_variance8x16_c; + if (flags & HAS_VSX) vpx_variance8x16 = vpx_variance8x16_vsx; + vpx_variance8x4 = vpx_variance8x4_c; + if (flags & HAS_VSX) vpx_variance8x4 = vpx_variance8x4_vsx; + vpx_variance8x8 = vpx_variance8x8_c; + if (flags & HAS_VSX) vpx_variance8x8 = vpx_variance8x8_vsx; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h b/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h new file mode 100644 index 0000000000..11c5b6d933 --- /dev/null +++ b/media/libvpx/config/linux/ppc64le/vpx_scale_rtcd.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/ppc.h" +static void setup_rtcd_internal(void) +{ + int flags = ppc_simd_caps(); + (void)flags; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/linux/x64/vp8_rtcd.h b/media/libvpx/config/linux/x64/vp8_rtcd.h index 4728639704..9e7746b813 100644 --- a/media/libvpx/config/linux/x64/vp8_rtcd.h +++ b/media/libvpx/config/linux/x64/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,56 +37,47 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); #define vp8_dequantize_b vp8_dequantize_b_mmx int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 int vp8_mbblock_error_c(struct macroblock *mb, int dc); @@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_sse2 -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -241,9 +238,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; @@ -261,4 +255,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/linux/x64/vp9_rtcd.h b/media/libvpx/config/linux/x64/vp9_rtcd.h index 47a708444d..52d6f0657d 100644 --- a/media/libvpx/config/linux/x64/vp9_rtcd.h +++ b/media/libvpx/config/linux/x64/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_sse2 +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -115,21 +139,29 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_quantize_fp = vp9_quantize_fp_sse2; if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -137,4 +169,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm index e474409958..83a97b3c7c 100644 --- a/media/libvpx/config/linux/x64/vpx_config.asm +++ b/media/libvpx/config/linux/x64/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 0 -%define ARCH_X86_64 1 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h index 23f32d1487..b0bfa58fe5 100644 --- a/media/libvpx/config/linux/x64/vpx_config.h +++ b/media/libvpx/config/linux/x64/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 0 -#define ARCH_X86_64 1 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h index 6f181c0a0e..8e17c8ab8a 100644 --- a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h +++ b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_sse2 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_sse2 -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_sse2 -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c - -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c - -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c - -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c - -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_sse2 +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); @@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get8x8var vpx_get8x8var_sse2 unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_sse2 -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2 +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_sse2 -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vpx_minmax_8x8 vpx_minmax_8x8_sse2 -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_sse2 +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x16 vpx_mse8x16_sse2 -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_sse2 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x32 vpx_sad16x32_sse2 @@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad4x8 vpx_sad4x8_sse2 @@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x4 vpx_sad8x4_sse2 @@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); -#define vpx_satd vpx_satd_sse2 +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vpx_subtract_block vpx_subtract_block_sse2 +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x32 vpx_variance16x32_sse2 +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x8 vpx_variance16x8_sse2 +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance32x64 vpx_variance32x64_sse2 +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x4 vpx_variance4x4_sse2 -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x8 vpx_variance4x8_sse2 -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x16 vpx_variance8x16_sse2 +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x4 vpx_variance8x4_sse2 +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x8 vpx_variance8x8_sse2 +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; @@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct32x32 = vpx_fdct32x32_sse2; if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; @@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; vpx_get16x16var = vpx_get16x16var_sse2; if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; - if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3; - vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; - vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; - if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3; vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; vpx_mse16x16 = vpx_mse16x16_sse2; if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_quantize_b = vpx_quantize_b_sse2; if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; @@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; @@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_variance16x16 = vpx_variance16x16_sse2; if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; vpx_variance32x32 = vpx_variance32x32_sse2; if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance64x32 = vpx_variance64x32_sse2; if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; vpx_variance64x64 = vpx_variance64x64_sse2; if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; } #endif @@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/linux/x64/vpx_scale_rtcd.h b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/linux/x64/vpx_scale_rtcd.h +++ b/media/libvpx/config/linux/x64/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/mac/arm64/vp8_rtcd.h b/media/libvpx/config/mac/arm64/vp8_rtcd.h new file mode 100644 index 0000000000..36e6855a6d --- /dev/null +++ b/media/libvpx/config/mac/arm64/vp8_rtcd.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon + +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon + +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon + +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +#define vp8_copy32xn vp8_copy32xn_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon + +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon + +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon + +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_neon + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_neon(struct blockd*, short *DQC); +#define vp8_dequantize_b vp8_dequantize_b_neon + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_neon + +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon + +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon + +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon + +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon + +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon + +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon + +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon + +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon + +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/mac/arm64/vp9_rtcd.h b/media/libvpx/config/mac/arm64/vp9_rtcd.h new file mode 100644 index 0000000000..e1b572fe62 --- /dev/null +++ b/media/libvpx/config/mac/arm64/vp9_rtcd.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); + +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_neon + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_neon + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_neon + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_neon + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_neon + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp vp9_quantize_fp_neon + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vp9_block_error = vp9_block_error_neon; + if (flags & HAS_SVE) vp9_block_error = vp9_block_error_sve; + vp9_block_error_fp = vp9_block_error_fp_neon; + if (flags & HAS_SVE) vp9_block_error_fp = vp9_block_error_fp_sve; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/mac/arm64/vpx_config.asm b/media/libvpx/config/mac/arm64/vpx_config.asm new file mode 100644 index 0000000000..cd4868978f --- /dev/null +++ b/media/libvpx/config/mac/arm64/vpx_config.asm @@ -0,0 +1,97 @@ +@ This file was created from a .asm file +@ using the ads2gas.pl script. +.syntax unified +.equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 +.equ VPX_ARCH_MIPS , 0 +.equ VPX_ARCH_X86 , 0 +.equ VPX_ARCH_X86_64 , 0 +.equ VPX_ARCH_PPC , 0 +.equ VPX_ARCH_LOONGARCH , 0 +.equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 1 +.equ HAVE_NEON_I8MM , 1 +.equ HAVE_SVE , 1 +.equ HAVE_SVE2 , 1 +.equ HAVE_MIPS32 , 0 +.equ HAVE_DSPR2 , 0 +.equ HAVE_MSA , 0 +.equ HAVE_MIPS64 , 0 +.equ HAVE_MMX , 0 +.equ HAVE_SSE , 0 +.equ HAVE_SSE2 , 0 +.equ HAVE_SSE3 , 0 +.equ HAVE_SSSE3 , 0 +.equ HAVE_SSE4_1 , 0 +.equ HAVE_AVX , 0 +.equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 +.equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 +.equ HAVE_LSX , 0 +.equ HAVE_LASX , 0 +.equ HAVE_VPX_PORTS , 1 +.equ HAVE_PTHREAD_H , 1 +.equ CONFIG_DEPENDENCY_TRACKING , 1 +.equ CONFIG_EXTERNAL_BUILD , 1 +.equ CONFIG_INSTALL_DOCS , 0 +.equ CONFIG_INSTALL_BINS , 1 +.equ CONFIG_INSTALL_LIBS , 1 +.equ CONFIG_INSTALL_SRCS , 0 +.equ CONFIG_DEBUG , 0 +.equ CONFIG_GPROF , 0 +.equ CONFIG_GCOV , 0 +.equ CONFIG_RVCT , 0 +.equ CONFIG_GCC , 1 +.equ CONFIG_MSVS , 0 +.equ CONFIG_PIC , 1 +.equ CONFIG_BIG_ENDIAN , 0 +.equ CONFIG_CODEC_SRCS , 0 +.equ CONFIG_DEBUG_LIBS , 0 +.equ CONFIG_DEQUANT_TOKENS , 0 +.equ CONFIG_DC_RECON , 0 +.equ CONFIG_RUNTIME_CPU_DETECT , 1 +.equ CONFIG_POSTPROC , 0 +.equ CONFIG_VP9_POSTPROC , 0 +.equ CONFIG_MULTITHREAD , 1 +.equ CONFIG_INTERNAL_STATS , 0 +.equ CONFIG_VP8_ENCODER , 1 +.equ CONFIG_VP8_DECODER , 1 +.equ CONFIG_VP9_ENCODER , 1 +.equ CONFIG_VP9_DECODER , 1 +.equ CONFIG_VP8 , 1 +.equ CONFIG_VP9 , 1 +.equ CONFIG_ENCODERS , 1 +.equ CONFIG_DECODERS , 1 +.equ CONFIG_STATIC_MSVCRT , 0 +.equ CONFIG_SPATIAL_RESAMPLING , 1 +.equ CONFIG_REALTIME_ONLY , 1 +.equ CONFIG_ONTHEFLY_BITPACKING , 0 +.equ CONFIG_ERROR_CONCEALMENT , 0 +.equ CONFIG_SHARED , 0 +.equ CONFIG_STATIC , 1 +.equ CONFIG_SMALL , 0 +.equ CONFIG_POSTPROC_VISUALIZER , 0 +.equ CONFIG_OS_SUPPORT , 1 +.equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 0 +.equ CONFIG_LIBYUV , 0 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 +.equ CONFIG_MULTI_RES_ENCODING , 1 +.equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_BETTER_HW_COMPATIBILITY , 0 +.equ CONFIG_EXPERIMENTAL , 0 +.equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 +.equ CONFIG_BITSTREAM_DEBUG , 0 +.equ CONFIG_MISMATCH_DEBUG , 0 +.equ CONFIG_FP_MB_STATS , 0 +.equ CONFIG_EMULATE_HARDWARE , 0 +.equ CONFIG_NON_GREEDY_MV , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 + .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/mac/arm64/vpx_config.c b/media/libvpx/config/mac/arm64/vpx_config.c new file mode 100644 index 0000000000..9689084601 --- /dev/null +++ b/media/libvpx/config/mac/arm64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=arm64-darwin-gcc --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-runtime-cpu-detect --enable-realtime-only"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/mac/arm64/vpx_config.h b/media/libvpx/config/mac/arm64/vpx_config.h new file mode 100644 index 0000000000..a9808406c8 --- /dev/null +++ b/media/libvpx/config/mac/arm64/vpx_config.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 +#define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 1 +#define HAVE_NEON_I8MM 1 +#define HAVE_SVE 1 +#define HAVE_SVE2 1 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_AVX512 0 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 1 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 0 +#define CONFIG_LIBYUV 0 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define DECODE_WIDTH_LIMIT 8192 +#define DECODE_HEIGHT_LIMIT 4608 +#endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h new file mode 100644 index 0000000000..3c34cafc59 --- /dev/null +++ b/media/libvpx/config/mac/arm64/vpx_dsp_rtcd.h @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_neon + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_neon + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_neon + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_neon + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon + +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_neon + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_neon + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_get4x4sse_cs)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); + +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon + +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon + +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon + +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_neon + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_neon + +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_neon + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b vpx_quantize_b_neon + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad16x8_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x16_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad32x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_neon + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x4x4d vpx_sad4x4x4d_neon + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_neon + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_neon + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad4x8x4d vpx_sad4x8x4d_neon + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_neon + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x16x4d vpx_sad8x16x4d_neon + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_neon + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_neon + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x4x4d vpx_sad8x4x4d_neon + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_neon + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad8x8x4d vpx_sad8x8x4d_neon + +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_neon + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_neon + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_neon + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_neon + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_neon + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_neon + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_neon + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_neon + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size); +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size); +RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon_dotprod(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_neon + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + + vpx_convolve8 = vpx_convolve8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8 = vpx_convolve8_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8 = vpx_convolve8_neon_i8mm; + vpx_convolve8_avg = vpx_convolve8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg = vpx_convolve8_avg_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg = vpx_convolve8_avg_neon_i8mm; + vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_neon_i8mm; + vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_neon_i8mm; + vpx_convolve8_horiz = vpx_convolve8_horiz_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_horiz = vpx_convolve8_horiz_neon_i8mm; + vpx_convolve8_vert = vpx_convolve8_vert_neon; + if (flags & HAS_NEON_DOTPROD) vpx_convolve8_vert = vpx_convolve8_vert_neon_dotprod; + if (flags & HAS_NEON_I8MM) vpx_convolve8_vert = vpx_convolve8_vert_neon_i8mm; + vpx_get16x16var = vpx_get16x16var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get16x16var = vpx_get16x16var_neon_dotprod; + vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get4x4sse_cs = vpx_get4x4sse_cs_neon_dotprod; + vpx_get8x8var = vpx_get8x8var_neon; + if (flags & HAS_NEON_DOTPROD) vpx_get8x8var = vpx_get8x8var_neon_dotprod; + vpx_mse16x16 = vpx_mse16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x16 = vpx_mse16x16_neon_dotprod; + vpx_mse16x8 = vpx_mse16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse16x8 = vpx_mse16x8_neon_dotprod; + vpx_mse8x16 = vpx_mse8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x16 = vpx_mse8x16_neon_dotprod; + vpx_mse8x8 = vpx_mse8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_mse8x8 = vpx_mse8x8_neon_dotprod; + vpx_sad16x16 = vpx_sad16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16 = vpx_sad16x16_neon_dotprod; + vpx_sad16x16_avg = vpx_sad16x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16_avg = vpx_sad16x16_avg_neon_dotprod; + vpx_sad16x16x4d = vpx_sad16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x16x4d = vpx_sad16x16x4d_neon_dotprod; + vpx_sad16x32 = vpx_sad16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32 = vpx_sad16x32_neon_dotprod; + vpx_sad16x32_avg = vpx_sad16x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32_avg = vpx_sad16x32_avg_neon_dotprod; + vpx_sad16x32x4d = vpx_sad16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x32x4d = vpx_sad16x32x4d_neon_dotprod; + vpx_sad16x8 = vpx_sad16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8 = vpx_sad16x8_neon_dotprod; + vpx_sad16x8_avg = vpx_sad16x8_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8_avg = vpx_sad16x8_avg_neon_dotprod; + vpx_sad16x8x4d = vpx_sad16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad16x8x4d = vpx_sad16x8x4d_neon_dotprod; + vpx_sad32x16 = vpx_sad32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16 = vpx_sad32x16_neon_dotprod; + vpx_sad32x16_avg = vpx_sad32x16_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16_avg = vpx_sad32x16_avg_neon_dotprod; + vpx_sad32x16x4d = vpx_sad32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x16x4d = vpx_sad32x16x4d_neon_dotprod; + vpx_sad32x32 = vpx_sad32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32 = vpx_sad32x32_neon_dotprod; + vpx_sad32x32_avg = vpx_sad32x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32_avg = vpx_sad32x32_avg_neon_dotprod; + vpx_sad32x32x4d = vpx_sad32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x32x4d = vpx_sad32x32x4d_neon_dotprod; + vpx_sad32x64 = vpx_sad32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64 = vpx_sad32x64_neon_dotprod; + vpx_sad32x64_avg = vpx_sad32x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64_avg = vpx_sad32x64_avg_neon_dotprod; + vpx_sad32x64x4d = vpx_sad32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad32x64x4d = vpx_sad32x64x4d_neon_dotprod; + vpx_sad64x32 = vpx_sad64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32 = vpx_sad64x32_neon_dotprod; + vpx_sad64x32_avg = vpx_sad64x32_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32_avg = vpx_sad64x32_avg_neon_dotprod; + vpx_sad64x32x4d = vpx_sad64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x32x4d = vpx_sad64x32x4d_neon_dotprod; + vpx_sad64x64 = vpx_sad64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64 = vpx_sad64x64_neon_dotprod; + vpx_sad64x64_avg = vpx_sad64x64_avg_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64_avg = vpx_sad64x64_avg_neon_dotprod; + vpx_sad64x64x4d = vpx_sad64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad64x64x4d = vpx_sad64x64x4d_neon_dotprod; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_neon_dotprod; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_neon_dotprod; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_neon_dotprod; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_neon_dotprod; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_neon_dotprod; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_neon_dotprod; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_neon_dotprod; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_neon_dotprod; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_neon_dotprod; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_neon_dotprod; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_neon_dotprod; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_neon_dotprod; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_neon_dotprod; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_neon_dotprod; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_neon_dotprod; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_neon_dotprod; + vpx_sse = vpx_sse_neon; + if (flags & HAS_NEON_DOTPROD) vpx_sse = vpx_sse_neon_dotprod; + vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_neon; + if (flags & HAS_SVE) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sve; + vpx_variance16x16 = vpx_variance16x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x16 = vpx_variance16x16_neon_dotprod; + vpx_variance16x32 = vpx_variance16x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x32 = vpx_variance16x32_neon_dotprod; + vpx_variance16x8 = vpx_variance16x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance16x8 = vpx_variance16x8_neon_dotprod; + vpx_variance32x16 = vpx_variance32x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x16 = vpx_variance32x16_neon_dotprod; + vpx_variance32x32 = vpx_variance32x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x32 = vpx_variance32x32_neon_dotprod; + vpx_variance32x64 = vpx_variance32x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance32x64 = vpx_variance32x64_neon_dotprod; + vpx_variance4x4 = vpx_variance4x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x4 = vpx_variance4x4_neon_dotprod; + vpx_variance4x8 = vpx_variance4x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance4x8 = vpx_variance4x8_neon_dotprod; + vpx_variance64x32 = vpx_variance64x32_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x32 = vpx_variance64x32_neon_dotprod; + vpx_variance64x64 = vpx_variance64x64_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance64x64 = vpx_variance64x64_neon_dotprod; + vpx_variance8x16 = vpx_variance8x16_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x16 = vpx_variance8x16_neon_dotprod; + vpx_variance8x4 = vpx_variance8x4_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x4 = vpx_variance8x4_neon_dotprod; + vpx_variance8x8 = vpx_variance8x8_neon; + if (flags & HAS_NEON_DOTPROD) vpx_variance8x8 = vpx_variance8x8_neon_dotprod; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h b/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h new file mode 100644 index 0000000000..ca594ea140 --- /dev/null +++ b/media/libvpx/config/mac/arm64/vpx_scale_rtcd.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/mac/ia32/vp8_rtcd.h b/media/libvpx/config/mac/ia32/vp8_rtcd.h index 5db5bbad85..ca42bb8a5e 100644 --- a/media/libvpx/config/mac/ia32/vp8_rtcd.h +++ b/media/libvpx/config/mac/ia32/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,57 +37,48 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); -RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); @@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -237,9 +234,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; @@ -277,9 +274,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; vp8_loop_filter_bv = vp8_loop_filter_bv_c; @@ -336,4 +330,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/mac/ia32/vp9_rtcd.h b/media/libvpx/config/mac/ia32/vp9_rtcd.h index da53895ca4..28c82a831c 100644 --- a/media/libvpx/config/mac/ia32/vp9_rtcd.h +++ b/media/libvpx/config/mac/ia32/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -113,16 +139,14 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_c; if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_block_error_fp = vp9_block_error_fp_c; if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_c; - if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_fht16x16 = vp9_fht16x16_c; if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; vp9_fht4x4 = vp9_fht4x4_c; @@ -133,9 +157,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; @@ -146,10 +167,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; vp9_quantize_fp = vp9_quantize_fp_c; if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; - vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; - if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -157,4 +190,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm index 2efc8b978b..bafe400119 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.asm +++ b/media/libvpx/config/mac/ia32/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 1 -%define ARCH_X86_64 0 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h index b26f9a51e8..a4cc48afe1 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.h +++ b/media/libvpx/config/mac/ia32/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 1 -#define ARCH_X86_64 0 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h index 112e326a64..0a6266a322 100644 --- a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h +++ b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); @@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); @@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); -RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; vpx_avg_8x8 = vpx_avg_8x8_c; if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_c; if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; @@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void) vpx_convolve8_avg = vpx_convolve8_avg_c; if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; @@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; vpx_fdct16x16 = vpx_fdct16x16_c; if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct16x16_1 = vpx_fdct16x16_1_c; if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; vpx_fdct32x32 = vpx_fdct32x32_c; @@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; vpx_hadamard_16x16 = vpx_hadamard_16x16_c; if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_c; if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; @@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; - if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; @@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_mse16x8 = vpx_mse16x8_c; if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_mse8x16 = vpx_mse8x16_c; if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; vpx_mse8x8 = vpx_mse8x8_c; @@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; vpx_quantize_b = vpx_quantize_b_c; if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; vpx_sad16x16_avg = vpx_sad16x16_avg_c; if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; vpx_sad16x16x4d = vpx_sad16x16x4d_c; if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; vpx_sad16x32 = vpx_sad16x32_c; if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; vpx_sad16x32_avg = vpx_sad16x32_avg_c; @@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; vpx_sad16x8_avg = vpx_sad16x8_avg_c; if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; vpx_sad16x8x4d = vpx_sad16x8x4d_c; if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; vpx_sad32x16 = vpx_sad32x16_c; if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; @@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; vpx_sad4x4_avg = vpx_sad4x4_avg_c; if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; vpx_sad4x4x4d = vpx_sad4x4x4d_c; if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad4x8 = vpx_sad4x8_c; if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; vpx_sad4x8_avg = vpx_sad4x8_avg_c; @@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void) vpx_sad64x32 = vpx_sad64x32_c; if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_c; if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x32x4d = vpx_sad64x32x4d_c; if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; vpx_sad64x64 = vpx_sad64x64_c; if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_c; if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_c; if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; vpx_sad8x16 = vpx_sad8x16_c; if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; vpx_sad8x16_avg = vpx_sad8x16_avg_c; if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; vpx_sad8x16x4d = vpx_sad8x16x4d_c; if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; vpx_sad8x4 = vpx_sad8x4_c; if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; vpx_sad8x4_avg = vpx_sad8x4_avg_c; @@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; vpx_sad8x8_avg = vpx_sad8x8_avg_c; if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x4d = vpx_sad8x8x4d_c; if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; vpx_satd = vpx_satd_c; if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; @@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; vpx_subtract_block = vpx_subtract_block_c; if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; @@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; vpx_variance16x32 = vpx_variance16x32_c; if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; vpx_variance16x8 = vpx_variance16x8_c; if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_c; if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; @@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; vpx_variance32x64 = vpx_variance32x64_c; if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance4x4 = vpx_variance4x4_c; if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; vpx_variance4x8 = vpx_variance4x8_c; @@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; vpx_variance8x16 = vpx_variance8x16_c; if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; vpx_variance8x4 = vpx_variance8x4_c; if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; vpx_variance8x8 = vpx_variance8x8_c; if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; vpx_vector_var = vpx_vector_var_c; if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; } @@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h +++ b/media/libvpx/config/mac/ia32/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/mac/x64/vp8_rtcd.h b/media/libvpx/config/mac/x64/vp8_rtcd.h index 4728639704..9e7746b813 100644 --- a/media/libvpx/config/mac/x64/vp8_rtcd.h +++ b/media/libvpx/config/mac/x64/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,56 +37,47 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); #define vp8_dequantize_b vp8_dequantize_b_mmx int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 int vp8_mbblock_error_c(struct macroblock *mb, int dc); @@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_sse2 -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -241,9 +238,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; @@ -261,4 +255,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/mac/x64/vp9_rtcd.h b/media/libvpx/config/mac/x64/vp9_rtcd.h index 47a708444d..52d6f0657d 100644 --- a/media/libvpx/config/mac/x64/vp9_rtcd.h +++ b/media/libvpx/config/mac/x64/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_sse2 +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -115,21 +139,29 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_quantize_fp = vp9_quantize_fp_sse2; if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -137,4 +169,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm index e474409958..83a97b3c7c 100644 --- a/media/libvpx/config/mac/x64/vpx_config.asm +++ b/media/libvpx/config/mac/x64/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 0 -%define ARCH_X86_64 1 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h index 23f32d1487..b0bfa58fe5 100644 --- a/media/libvpx/config/mac/x64/vpx_config.h +++ b/media/libvpx/config/mac/x64/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 0 -#define ARCH_X86_64 1 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h index 6f181c0a0e..8e17c8ab8a 100644 --- a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h +++ b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_sse2 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_sse2 -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_sse2 -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c - -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c - -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c - -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c - -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_sse2 +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); @@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get8x8var vpx_get8x8var_sse2 unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_sse2 -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2 +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_sse2 -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vpx_minmax_8x8 vpx_minmax_8x8_sse2 -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_sse2 +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x16 vpx_mse8x16_sse2 -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_sse2 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x32 vpx_sad16x32_sse2 @@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad4x8 vpx_sad4x8_sse2 @@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x4 vpx_sad8x4_sse2 @@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); -#define vpx_satd vpx_satd_sse2 +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vpx_subtract_block vpx_subtract_block_sse2 +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x32 vpx_variance16x32_sse2 +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x8 vpx_variance16x8_sse2 +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance32x64 vpx_variance32x64_sse2 +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x4 vpx_variance4x4_sse2 -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x8 vpx_variance4x8_sse2 -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x16 vpx_variance8x16_sse2 +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x4 vpx_variance8x4_sse2 +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x8 vpx_variance8x8_sse2 +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; @@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct32x32 = vpx_fdct32x32_sse2; if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; @@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; vpx_get16x16var = vpx_get16x16var_sse2; if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; - if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3; - vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; - vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; - if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3; vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; vpx_mse16x16 = vpx_mse16x16_sse2; if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_quantize_b = vpx_quantize_b_sse2; if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; @@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; @@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_variance16x16 = vpx_variance16x16_sse2; if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; vpx_variance32x32 = vpx_variance32x32_sse2; if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance64x32 = vpx_variance64x32_sse2; if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; vpx_variance64x64 = vpx_variance64x64_sse2; if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; } #endif @@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/mac/x64/vpx_scale_rtcd.h b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/mac/x64/vpx_scale_rtcd.h +++ b/media/libvpx/config/mac/x64/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/vpx_version.h b/media/libvpx/config/vpx_version.h index 24da169b4f..ba9b63a4a3 100644 --- a/media/libvpx/config/vpx_version.h +++ b/media/libvpx/config/vpx_version.h @@ -1,7 +1,11 @@ +// This file is generated. Do not edit. +#ifndef VPX_VERSION_H_ +#define VPX_VERSION_H_ #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 16 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.16.0" +#define VERSION_STRING " v1.16.0" +#endif // VPX_VERSION_H_ diff --git a/media/libvpx/config/win/ia32/vp8_rtcd.h b/media/libvpx/config/win/ia32/vp8_rtcd.h index 5db5bbad85..ca42bb8a5e 100644 --- a/media/libvpx/config/win/ia32/vp8_rtcd.h +++ b/media/libvpx/config/win/ia32/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,57 +37,48 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,9 +88,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); -RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,9 +100,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,42 +124,37 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); @@ -167,9 +164,9 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -237,9 +234,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; @@ -277,9 +274,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_loop_filter_bh = vp8_loop_filter_bh_c; if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; vp8_loop_filter_bv = vp8_loop_filter_bv_c; @@ -336,4 +330,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/win/ia32/vp9_rtcd.h b/media/libvpx/config/win/ia32/vp9_rtcd.h index da53895ca4..28c82a831c 100644 --- a/media/libvpx/config/win/ia32/vp9_rtcd.h +++ b/media/libvpx/config/win/ia32/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -RTCD_EXTERN int64_t (*vp9_block_error_fp)(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,18 +83,13 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); RTCD_EXTERN void (*vp9_filter_by_weight8x8)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride); -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,20 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -113,16 +139,14 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_c; if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_block_error_fp = vp9_block_error_fp_c; if (flags & HAS_SSE2) vp9_block_error_fp = vp9_block_error_fp_sse2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_c; - if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_fht16x16 = vp9_fht16x16_c; if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; vp9_fht4x4 = vp9_fht4x4_c; @@ -133,9 +157,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_filter_by_weight16x16 = vp9_filter_by_weight16x16_sse2; vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_c; if (flags & HAS_SSE2) vp9_filter_by_weight8x8 = vp9_filter_by_weight8x8_sse2; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; vp9_fwht4x4 = vp9_fwht4x4_c; if (flags & HAS_SSE2) vp9_fwht4x4 = vp9_fwht4x4_sse2; vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; @@ -146,10 +167,22 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; vp9_quantize_fp = vp9_quantize_fp_c; if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; - vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; - if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -157,4 +190,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm index 4d443be604..a157619c4b 100644 --- a/media/libvpx/config/win/ia32/vpx_config.asm +++ b/media/libvpx/config/win/ia32/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 1 -%define ARCH_X86_64 0 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 1 +%define VPX_ARCH_X86_64 0 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 0 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h index 1aef765a31..37719d5b8d 100644 --- a/media/libvpx/config/win/ia32/vpx_config.h +++ b/media/libvpx/config/win/ia32/vpx_config.h @@ -10,12 +10,19 @@ #define VPX_CONFIG_H #define RESTRICT #define INLINE inline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 1 -#define ARCH_X86_64 0 -#define HAVE_NEON 0 +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 1 +#define VPX_ARCH_X86_64 0 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 0 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h index 112e326a64..0a6266a322 100644 --- a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h +++ b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,242 +44,215 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); RTCD_EXTERN unsigned int (*vpx_avg_8x8)(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); @@ -300,47 +289,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride); -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *); -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -353,14 +348,22 @@ RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_38_add)(const tran_low_t *input, uint8_t *dest, int stride); + void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -369,6 +372,7 @@ RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -381,6 +385,7 @@ RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, in void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,9 +400,9 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); RTCD_EXTERN int16_t (*vpx_int_pro_col)(const uint8_t *ref, const int width); -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +RTCD_EXTERN void (*vpx_int_pro_row)(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); @@ -456,9 +461,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); -RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -468,22 +473,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -493,12 +499,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -508,18 +520,9 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -529,9 +532,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -541,18 +544,9 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -564,9 +558,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -578,16 +572,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -599,9 +587,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -611,17 +599,9 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -631,47 +611,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -681,17 +657,9 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -701,12 +669,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -716,273 +681,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_avx2(const int16_t *coeff, int length); RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1003,6 +1087,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_avg_4x4 = vpx_avg_4x4_sse2; vpx_avg_8x8 = vpx_avg_8x8_c; if (flags & HAS_SSE2) vpx_avg_8x8 = vpx_avg_8x8_sse2; + vpx_comp_avg_pred = vpx_comp_avg_pred_c; + if (flags & HAS_SSE2) vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_c; if (flags & HAS_SSE2) vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; @@ -1010,12 +1097,15 @@ static void setup_rtcd_internal(void) vpx_convolve8_avg = vpx_convolve8_avg_c; if (flags & HAS_SSE2) vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_c; if (flags & HAS_SSE2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_c; if (flags & HAS_SSE2) vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; @@ -1094,6 +1184,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_dc_top_predictor_8x8 = vpx_dc_top_predictor_8x8_sse2; vpx_fdct16x16 = vpx_fdct16x16_c; if (flags & HAS_SSE2) vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct16x16_1 = vpx_fdct16x16_1_c; if (flags & HAS_SSE2) vpx_fdct16x16_1 = vpx_fdct16x16_1_sse2; vpx_fdct32x32 = vpx_fdct32x32_c; @@ -1129,6 +1220,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_h_predictor_8x8 = vpx_h_predictor_8x8_sse2; vpx_hadamard_16x16 = vpx_hadamard_16x16_c; if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_c; if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; @@ -1137,20 +1232,28 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_idct16x16_1_add = vpx_idct16x16_1_add_sse2; vpx_idct16x16_256_add = vpx_idct16x16_256_add_c; if (flags & HAS_SSE2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; + vpx_idct16x16_38_add = vpx_idct16x16_38_add_c; + if (flags & HAS_SSE2) vpx_idct16x16_38_add = vpx_idct16x16_38_add_sse2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; - if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_SSE2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; if (flags & HAS_SSE2) vpx_idct32x32_1_add = vpx_idct32x32_1_add_sse2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; if (flags & HAS_SSE2) vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; + if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct4x4_16_add = vpx_idct4x4_16_add_c; if (flags & HAS_SSE2) vpx_idct4x4_16_add = vpx_idct4x4_16_add_sse2; vpx_idct4x4_1_add = vpx_idct4x4_1_add_c; if (flags & HAS_SSE2) vpx_idct4x4_1_add = vpx_idct4x4_1_add_sse2; vpx_idct8x8_12_add = vpx_idct8x8_12_add_c; if (flags & HAS_SSE2) vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; vpx_idct8x8_1_add = vpx_idct8x8_1_add_c; if (flags & HAS_SSE2) vpx_idct8x8_1_add = vpx_idct8x8_1_add_sse2; vpx_idct8x8_64_add = vpx_idct8x8_64_add_c; @@ -1198,6 +1301,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_mse16x8 = vpx_mse16x8_c; if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_mse8x16 = vpx_mse8x16_c; if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; vpx_mse8x8 = vpx_mse8x8_c; @@ -1208,17 +1312,19 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_sse2; vpx_quantize_b = vpx_quantize_b_c; if (flags & HAS_SSE2) vpx_quantize_b = vpx_quantize_b_sse2; + if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; + if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; + vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; + if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_SSE2) vpx_sad16x16 = vpx_sad16x16_sse2; vpx_sad16x16_avg = vpx_sad16x16_avg_c; if (flags & HAS_SSE2) vpx_sad16x16_avg = vpx_sad16x16_avg_sse2; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; vpx_sad16x16x4d = vpx_sad16x16x4d_c; if (flags & HAS_SSE2) vpx_sad16x16x4d = vpx_sad16x16x4d_sse2; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; vpx_sad16x32 = vpx_sad16x32_c; if (flags & HAS_SSE2) vpx_sad16x32 = vpx_sad16x32_sse2; vpx_sad16x32_avg = vpx_sad16x32_avg_c; @@ -1229,13 +1335,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad16x8 = vpx_sad16x8_sse2; vpx_sad16x8_avg = vpx_sad16x8_avg_c; if (flags & HAS_SSE2) vpx_sad16x8_avg = vpx_sad16x8_avg_sse2; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; vpx_sad16x8x4d = vpx_sad16x8x4d_c; if (flags & HAS_SSE2) vpx_sad16x8x4d = vpx_sad16x8x4d_sse2; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; vpx_sad32x16 = vpx_sad32x16_c; if (flags & HAS_SSE2) vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; @@ -1265,12 +1366,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad4x4 = vpx_sad4x4_sse2; vpx_sad4x4_avg = vpx_sad4x4_avg_c; if (flags & HAS_SSE2) vpx_sad4x4_avg = vpx_sad4x4_avg_sse2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; vpx_sad4x4x4d = vpx_sad4x4x4d_c; if (flags & HAS_SSE2) vpx_sad4x4x4d = vpx_sad4x4x4d_sse2; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad4x8 = vpx_sad4x8_c; if (flags & HAS_SSE2) vpx_sad4x8 = vpx_sad4x8_sse2; vpx_sad4x8_avg = vpx_sad4x8_avg_c; @@ -1280,30 +1377,31 @@ static void setup_rtcd_internal(void) vpx_sad64x32 = vpx_sad64x32_c; if (flags & HAS_SSE2) vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_c; if (flags & HAS_SSE2) vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x32x4d = vpx_sad64x32x4d_c; if (flags & HAS_SSE2) vpx_sad64x32x4d = vpx_sad64x32x4d_sse2; vpx_sad64x64 = vpx_sad64x64_c; if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_c; if (flags & HAS_SSE2) vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_c; if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; vpx_sad8x16 = vpx_sad8x16_c; if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; vpx_sad8x16_avg = vpx_sad8x16_avg_c; if (flags & HAS_SSE2) vpx_sad8x16_avg = vpx_sad8x16_avg_sse2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; vpx_sad8x16x4d = vpx_sad8x16x4d_c; if (flags & HAS_SSE2) vpx_sad8x16x4d = vpx_sad8x16x4d_sse2; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; vpx_sad8x4 = vpx_sad8x4_c; if (flags & HAS_SSE2) vpx_sad8x4 = vpx_sad8x4_sse2; vpx_sad8x4_avg = vpx_sad8x4_avg_c; @@ -1314,16 +1412,74 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vpx_sad8x8 = vpx_sad8x8_sse2; vpx_sad8x8_avg = vpx_sad8x8_avg_c; if (flags & HAS_SSE2) vpx_sad8x8_avg = vpx_sad8x8_avg_sse2; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; vpx_sad8x8x4d = vpx_sad8x8x4d_c; if (flags & HAS_SSE2) vpx_sad8x8x4d = vpx_sad8x8x4d_sse2; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + vpx_sad_skip_16x16 = vpx_sad_skip_16x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16 = vpx_sad_skip_16x16_sse2; + vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x16x4d = vpx_sad_skip_16x16x4d_sse2; + vpx_sad_skip_16x32 = vpx_sad_skip_16x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32 = vpx_sad_skip_16x32_sse2; + vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x32x4d = vpx_sad_skip_16x32x4d_sse2; + vpx_sad_skip_16x8 = vpx_sad_skip_16x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8 = vpx_sad_skip_16x8_sse2; + vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_16x8x4d = vpx_sad_skip_16x8x4d_sse2; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_4x8 = vpx_sad_skip_4x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8 = vpx_sad_skip_4x8_sse2; + vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_4x8x4d = vpx_sad_skip_4x8x4d_sse2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_sad_skip_8x16 = vpx_sad_skip_8x16_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16 = vpx_sad_skip_8x16_sse2; + vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x16x4d = vpx_sad_skip_8x16x4d_sse2; + vpx_sad_skip_8x8 = vpx_sad_skip_8x8_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8 = vpx_sad_skip_8x8_sse2; + vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_c; + if (flags & HAS_SSE2) vpx_sad_skip_8x8x4d = vpx_sad_skip_8x8x4d_sse2; vpx_satd = vpx_satd_c; if (flags & HAS_SSE2) vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_c; if (flags & HAS_SSE2) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; @@ -1408,6 +1564,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; vpx_subtract_block = vpx_subtract_block_c; if (flags & HAS_SSE2) vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c; if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2; vpx_tm_predictor_16x16 = vpx_tm_predictor_16x16_c; @@ -1431,8 +1588,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; vpx_variance16x32 = vpx_variance16x32_c; if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; vpx_variance16x8 = vpx_variance16x8_c; if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_c; if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; @@ -1441,6 +1600,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; vpx_variance32x64 = vpx_variance32x64_c; if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance4x4 = vpx_variance4x4_c; if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; vpx_variance4x8 = vpx_variance4x8_c; @@ -1453,10 +1613,13 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; vpx_variance8x16 = vpx_variance8x16_c; if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; vpx_variance8x4 = vpx_variance8x4_c; if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; vpx_variance8x8 = vpx_variance8x8_c; if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; vpx_vector_var = vpx_vector_var_c; if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2; } @@ -1466,4 +1629,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/win/ia32/vpx_scale_rtcd.h b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/win/ia32/vpx_scale_rtcd.h +++ b/media/libvpx/config/win/ia32/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/config/win/x64/vp8_rtcd.h b/media/libvpx/config/win/x64/vp8_rtcd.h index 4728639704..9e7746b813 100644 --- a/media/libvpx/config/win/x64/vp8_rtcd.h +++ b/media/libvpx/config/win/x64/vp8_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ @@ -26,56 +37,47 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -86,8 +88,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -98,8 +100,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); #define vp8_dequantize_b vp8_dequantize_b_mmx int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -122,41 +124,36 @@ void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 int vp8_mbblock_error_c(struct macroblock *mb, int dc); @@ -167,8 +164,8 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_sse2 -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -184,40 +181,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -241,9 +238,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; - vp8_full_search_sad = vp8_full_search_sad_c; - if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; - if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; @@ -261,4 +255,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP8_RTCD_H_ diff --git a/media/libvpx/config/win/x64/vp9_rtcd.h b/media/libvpx/config/win/x64/vp9_rtcd.h index 47a708444d..52d6f0657d 100644 --- a/media/libvpx/config/win/x64/vp9_rtcd.h +++ b/media/libvpx/config/win/x64/vp9_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -14,12 +25,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_filter.h" +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#include "vp9/encoder/vp9_temporal_filter.h" +#endif struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -29,23 +46,22 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); RTCD_EXTERN int64_t (*vp9_block_error)(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_sse2 +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +RTCD_EXTERN int64_t (*vp9_block_error_fp)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); - -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -67,17 +83,12 @@ void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, void vp9_filter_by_weight8x8_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); #define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_sse2 -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -88,22 +99,35 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); -RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve12_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vp9_rtcd(void); @@ -115,21 +139,29 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; - vp9_full_search_sad = vp9_full_search_sad_c; - if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; - if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; + vp9_block_error_fp = vp9_block_error_fp_sse2; + if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_quantize_fp = vp9_quantize_fp_sse2; if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_avx2; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; + vpx_convolve12 = vpx_convolve12_c; + if (flags & HAS_SSSE3) vpx_convolve12 = vpx_convolve12_ssse3; + if (flags & HAS_AVX2) vpx_convolve12 = vpx_convolve12_avx2; + vpx_convolve12_horiz = vpx_convolve12_horiz_c; + if (flags & HAS_SSSE3) vpx_convolve12_horiz = vpx_convolve12_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_horiz = vpx_convolve12_horiz_avx2; + vpx_convolve12_vert = vpx_convolve12_vert_c; + if (flags & HAS_SSSE3) vpx_convolve12_vert = vpx_convolve12_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve12_vert = vpx_convolve12_vert_avx2; } #endif @@ -137,4 +169,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VP9_RTCD_H_ diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm index 27e5dabd9d..2044a9e9b8 100644 --- a/media/libvpx/config/win/x64/vpx_config.asm +++ b/media/libvpx/config/win/x64/vpx_config.asm @@ -1,9 +1,16 @@ -%define ARCH_ARM 0 -%define ARCH_MIPS 0 -%define ARCH_X86 0 -%define ARCH_X86_64 1 -%define HAVE_NEON 0 +%define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 +%define VPX_ARCH_MIPS 0 +%define VPX_ARCH_X86 0 +%define VPX_ARCH_X86_64 1 +%define VPX_ARCH_PPC 0 +%define VPX_ARCH_LOONGARCH 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 +%define HAVE_SVE2 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -16,6 +23,11 @@ %define HAVE_SSE4_1 1 %define HAVE_AVX 1 %define HAVE_AVX2 1 +%define HAVE_AVX512 1 +%define HAVE_VSX 0 +%define HAVE_MMI 0 +%define HAVE_LSX 0 +%define HAVE_LASX 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 0 %define CONFIG_DEPENDENCY_TRACKING 1 @@ -72,7 +84,10 @@ %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 -%define CONFIG_SPATIAL_SVC 0 +%define CONFIG_ALWAYS_ADJUST_BPM 0 +%define CONFIG_BITSTREAM_DEBUG 0 +%define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 -%define CONFIG_MISC_FIXES 0 +%define CONFIG_NON_GREEDY_MV 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/x64/vpx_config.c b/media/libvpx/config/win/x64/vpx_config.c index 8e75cba148..ace29b7b87 100644 --- a/media/libvpx/config/win/x64/vpx_config.c +++ b/media/libvpx/config/win/x64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-win64-vs12 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-postproc --enable-vp9-postproc --as=yasm"; +static const char* const cfg = "--target=x86_64-win64-vs17 --enable-external-build --disable-examples --disable-install-docs --disable-unit-tests --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic --enable-postproc --enable-vp9-postproc --as=yasm"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h index 435815ad7a..d758715c3e 100644 --- a/media/libvpx/config/win/x64/vpx_config.h +++ b/media/libvpx/config/win/x64/vpx_config.h @@ -9,13 +9,20 @@ #ifndef VPX_CONFIG_H #define VPX_CONFIG_H #define RESTRICT -#define INLINE __forceinline -#define ARCH_ARM 0 -#define ARCH_MIPS 0 -#define ARCH_X86 0 -#define ARCH_X86_64 1 -#define HAVE_NEON 0 +#define INLINE __inline +#define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 +#define VPX_ARCH_MIPS 0 +#define VPX_ARCH_X86 0 +#define VPX_ARCH_X86_64 1 +#define VPX_ARCH_PPC 0 +#define VPX_ARCH_LOONGARCH 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 +#define HAVE_SVE2 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -28,6 +35,11 @@ #define HAVE_SSE4_1 1 #define HAVE_AVX 1 #define HAVE_AVX2 1 +#define HAVE_AVX512 1 +#define HAVE_VSX 0 +#define HAVE_MMI 0 +#define HAVE_LSX 0 +#define HAVE_LASX 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 0 #define CONFIG_DEPENDENCY_TRACKING 1 @@ -84,10 +96,13 @@ #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_ALWAYS_ADJUST_BPM 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 -#define CONFIG_MISC_FIXES 0 +#define CONFIG_NON_GREEDY_MV 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h index 6f181c0a0e..8e17c8ab8a 100644 --- a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h +++ b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +24,11 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -28,243 +44,216 @@ unsigned int vpx_avg_8x8_sse2(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_sse2 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +RTCD_EXTERN void (*vpx_comp_avg_pred)(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_sse2 -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_sse2 -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c - -void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c - -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 -void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c - -void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c - -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c - -void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c - -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c - -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c - -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_sse2 +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride); +RTCD_EXTERN void (*vpx_fdct16x16)(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride); @@ -301,48 +290,54 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get8x8var vpx_get8x8var_sse2 unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_sse2 -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2 +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff); -RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -355,16 +350,22 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, int stride); RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -395,15 +396,14 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_sse2 -void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -463,8 +463,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -475,21 +475,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vpx_minmax_8x8 vpx_minmax_8x8_sse2 -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_sse2 +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x16 vpx_mse8x16_sse2 -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_sse2 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -500,16 +501,18 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +RTCD_EXTERN void (*vpx_quantize_b_32x32)(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -519,19 +522,10 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 -void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad16x32 vpx_sad16x32_sse2 @@ -540,8 +534,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -552,19 +546,10 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 -void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -575,8 +560,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -589,16 +574,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -610,8 +589,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -622,18 +601,10 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 -void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad4x8 vpx_sad4x8_sse2 @@ -642,47 +613,43 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x32_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); RTCD_EXTERN unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_sad64x64_avg_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); - -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -692,18 +659,10 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 -void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x4 vpx_sad8x4_sse2 @@ -712,13 +671,10 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -727,273 +683,392 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); - -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 -void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vpx_sad_skip_64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +RTCD_EXTERN void (*vpx_sad_skip_64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 int vpx_satd_c(const int16_t *coeff, int length); int vpx_satd_sse2(const int16_t *coeff, int length); -#define vpx_satd vpx_satd_sse2 +int vpx_satd_avx2(const int16_t *coeff, int length); +RTCD_EXTERN int (*vpx_satd)(const int16_t *coeff, int length); -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vpx_scaled_2d)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +int64_t vpx_sse_c(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_sse4_1(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +int64_t vpx_sse_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); +RTCD_EXTERN int64_t (*vpx_sse)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int width, int height); -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vpx_subtract_block vpx_subtract_block_sse2 +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vpx_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x32 vpx_variance16x32_sse2 +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x8 vpx_variance16x8_sse2 +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance32x64 vpx_variance32x64_sse2 +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x4 vpx_variance4x4_sse2 -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x8 vpx_variance4x8_sse2 -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x16 vpx_variance8x16_sse2 +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x4 vpx_variance8x4_sse2 +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance8x8 vpx_variance8x8_sse2 +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1010,15 +1085,20 @@ static void setup_rtcd_internal(void) (void)flags; + vpx_comp_avg_pred = vpx_comp_avg_pred_sse2; + if (flags & HAS_AVX2) vpx_comp_avg_pred = vpx_comp_avg_pred_avx2; vpx_convolve8 = vpx_convolve8_sse2; if (flags & HAS_SSSE3) vpx_convolve8 = vpx_convolve8_ssse3; if (flags & HAS_AVX2) vpx_convolve8 = vpx_convolve8_avx2; vpx_convolve8_avg = vpx_convolve8_avg_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg = vpx_convolve8_avg_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg = vpx_convolve8_avg_avx2; vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_horiz = vpx_convolve8_avg_horiz_avx2; vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_ssse3; + if (flags & HAS_AVX2) vpx_convolve8_avg_vert = vpx_convolve8_avg_vert_avx2; vpx_convolve8_horiz = vpx_convolve8_horiz_sse2; if (flags & HAS_SSSE3) vpx_convolve8_horiz = vpx_convolve8_horiz_ssse3; if (flags & HAS_AVX2) vpx_convolve8_horiz = vpx_convolve8_horiz_avx2; @@ -1051,6 +1131,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_d63_predictor_4x4 = vpx_d63_predictor_4x4_ssse3; vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_c; if (flags & HAS_SSSE3) vpx_d63_predictor_8x8 = vpx_d63_predictor_8x8_ssse3; + vpx_fdct16x16 = vpx_fdct16x16_sse2; + if (flags & HAS_AVX2) vpx_fdct16x16 = vpx_fdct16x16_avx2; vpx_fdct32x32 = vpx_fdct32x32_sse2; if (flags & HAS_AVX2) vpx_fdct32x32 = vpx_fdct32x32_avx2; vpx_fdct32x32_rd = vpx_fdct32x32_rd_sse2; @@ -1059,40 +1141,39 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_fdct8x8 = vpx_fdct8x8_ssse3; vpx_get16x16var = vpx_get16x16var_sse2; if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; + vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; + if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; + vpx_idct16x16_256_add = vpx_idct16x16_256_add_sse2; + if (flags & HAS_AVX2) vpx_idct16x16_256_add = vpx_idct16x16_256_add_avx2; vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_sse2; - if (flags & HAS_SSSE3) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_ssse3; - vpx_idct32x32_135_add = vpx_idct32x32_1024_add_sse2; + if (flags & HAS_AVX2) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_avx2; + vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_135_add = vpx_idct32x32_135_add_ssse3; + if (flags & HAS_AVX2) vpx_idct32x32_135_add = vpx_idct32x32_135_add_avx2; vpx_idct32x32_34_add = vpx_idct32x32_34_add_sse2; if (flags & HAS_SSSE3) vpx_idct32x32_34_add = vpx_idct32x32_34_add_ssse3; vpx_idct8x8_12_add = vpx_idct8x8_12_add_sse2; if (flags & HAS_SSSE3) vpx_idct8x8_12_add = vpx_idct8x8_12_add_ssse3; - vpx_idct8x8_64_add = vpx_idct8x8_64_add_sse2; - if (flags & HAS_SSSE3) vpx_idct8x8_64_add = vpx_idct8x8_64_add_ssse3; vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16 = vpx_lpf_horizontal_16_avx2; vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_sse2; if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; vpx_mse16x16 = vpx_mse16x16_sse2; if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_quantize_b = vpx_quantize_b_sse2; if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; if (flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; + if (flags & HAS_AVX2) vpx_quantize_b = vpx_quantize_b_avx2; vpx_quantize_b_32x32 = vpx_quantize_b_32x32_c; if (flags & HAS_SSSE3) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_ssse3; if (flags & HAS_AVX) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx; - vpx_sad16x16x3 = vpx_sad16x16x3_c; - if (flags & HAS_SSE3) vpx_sad16x16x3 = vpx_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x16x3 = vpx_sad16x16x3_ssse3; - vpx_sad16x16x8 = vpx_sad16x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x16x8 = vpx_sad16x16x8_sse4_1; - vpx_sad16x8x3 = vpx_sad16x8x3_c; - if (flags & HAS_SSE3) vpx_sad16x8x3 = vpx_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vpx_sad16x8x3 = vpx_sad16x8x3_ssse3; - vpx_sad16x8x8 = vpx_sad16x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad16x8x8 = vpx_sad16x8x8_sse4_1; + if (flags & HAS_AVX2) vpx_quantize_b_32x32 = vpx_quantize_b_32x32_avx2; vpx_sad32x16 = vpx_sad32x16_sse2; if (flags & HAS_AVX2) vpx_sad32x16 = vpx_sad32x16_avx2; vpx_sad32x16_avg = vpx_sad32x16_avg_sse2; @@ -1107,30 +1188,52 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_sad32x64 = vpx_sad32x64_avx2; vpx_sad32x64_avg = vpx_sad32x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad32x64_avg = vpx_sad32x64_avg_avx2; - vpx_sad4x4x3 = vpx_sad4x4x3_c; - if (flags & HAS_SSE3) vpx_sad4x4x3 = vpx_sad4x4x3_sse3; - vpx_sad4x4x8 = vpx_sad4x4x8_c; - if (flags & HAS_SSE4_1) vpx_sad4x4x8 = vpx_sad4x4x8_sse4_1; vpx_sad64x32 = vpx_sad64x32_sse2; if (flags & HAS_AVX2) vpx_sad64x32 = vpx_sad64x32_avx2; + if (flags & HAS_AVX512) vpx_sad64x32 = vpx_sad64x32_avx512; vpx_sad64x32_avg = vpx_sad64x32_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x32_avg = vpx_sad64x32_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x32_avg = vpx_sad64x32_avg_avx512; vpx_sad64x64 = vpx_sad64x64_sse2; if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2; + if (flags & HAS_AVX512) vpx_sad64x64 = vpx_sad64x64_avx512; vpx_sad64x64_avg = vpx_sad64x64_avg_sse2; if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; + if (flags & HAS_AVX512) vpx_sad64x64_avg = vpx_sad64x64_avg_avx512; vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; - vpx_sad8x16x3 = vpx_sad8x16x3_c; - if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; - vpx_sad8x16x8 = vpx_sad8x16x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x16x8 = vpx_sad8x16x8_sse4_1; - vpx_sad8x8x3 = vpx_sad8x8x3_c; - if (flags & HAS_SSE3) vpx_sad8x8x3 = vpx_sad8x8x3_sse3; - vpx_sad8x8x8 = vpx_sad8x8x8_c; - if (flags & HAS_SSE4_1) vpx_sad8x8x8 = vpx_sad8x8x8_sse4_1; + if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; + vpx_sad_skip_32x16 = vpx_sad_skip_32x16_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16 = vpx_sad_skip_32x16_avx2; + vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x16x4d = vpx_sad_skip_32x16x4d_avx2; + vpx_sad_skip_32x32 = vpx_sad_skip_32x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32 = vpx_sad_skip_32x32_avx2; + vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x32x4d = vpx_sad_skip_32x32x4d_avx2; + vpx_sad_skip_32x64 = vpx_sad_skip_32x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64 = vpx_sad_skip_32x64_avx2; + vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_32x64x4d = vpx_sad_skip_32x64x4d_avx2; + vpx_sad_skip_64x32 = vpx_sad_skip_64x32_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32 = vpx_sad_skip_64x32_avx512; + vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x32x4d = vpx_sad_skip_64x32x4d_avx512; + vpx_sad_skip_64x64 = vpx_sad_skip_64x64_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64 = vpx_sad_skip_64x64_avx512; + vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_sse2; + if (flags & HAS_AVX2) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx2; + if (flags & HAS_AVX512) vpx_sad_skip_64x64x4d = vpx_sad_skip_64x64x4d_avx512; + vpx_satd = vpx_satd_sse2; + if (flags & HAS_AVX2) vpx_satd = vpx_satd_avx2; vpx_scaled_2d = vpx_scaled_2d_c; if (flags & HAS_SSSE3) vpx_scaled_2d = vpx_scaled_2d_ssse3; + vpx_sse = vpx_sse_c; + if (flags & HAS_SSE4_1) vpx_sse = vpx_sse_sse4_1; + if (flags & HAS_AVX2) vpx_sse = vpx_sse_avx2; vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_avg_variance16x16 = vpx_sub_pixel_avg_variance16x16_ssse3; vpx_sub_pixel_avg_variance16x32 = vpx_sub_pixel_avg_variance16x32_sse2; @@ -1187,16 +1290,30 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x4 = vpx_sub_pixel_variance8x4_ssse3; vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2; if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; + vpx_subtract_block = vpx_subtract_block_sse2; + if (flags & HAS_AVX2) vpx_subtract_block = vpx_subtract_block_avx2; vpx_variance16x16 = vpx_variance16x16_sse2; if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; vpx_variance32x32 = vpx_variance32x32_sse2; if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance64x32 = vpx_variance64x32_sse2; if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; vpx_variance64x64 = vpx_variance64x64_sse2; if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2; + vpx_variance8x16 = vpx_variance8x16_sse2; + if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2; + vpx_variance8x4 = vpx_variance8x4_sse2; + if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2; + vpx_variance8x8 = vpx_variance8x8_sse2; + if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2; } #endif @@ -1204,4 +1321,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_DSP_RTCD_H_ diff --git a/media/libvpx/config/win/x64/vpx_scale_rtcd.h b/media/libvpx/config/win/x64/vpx_scale_rtcd.h index ddf7d01cca..18e5b71579 100644 --- a/media/libvpx/config/win/x64/vpx_scale_rtcd.h +++ b/media/libvpx/config/win/x64/vpx_scale_rtcd.h @@ -1,3 +1,14 @@ +/* + * Copyright (c) 2026 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +57,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c @@ -66,4 +80,4 @@ static void setup_rtcd_internal(void) } // extern "C" #endif -#endif +#endif // VPX_SCALE_RTCD_H_ diff --git a/media/libvpx/generate_sources_mozbuild.sh b/media/libvpx/generate_sources_mozbuild.sh index 57d648fc60..cb0c2793ac 100644 --- a/media/libvpx/generate_sources_mozbuild.sh +++ b/media/libvpx/generate_sources_mozbuild.sh @@ -195,14 +195,24 @@ all_platforms="--enable-external-build --disable-examples --disable-install-docs all_platforms="${all_platforms} --enable-multi-res-encoding --size-limit=8192x4608 --enable-pic" x86_platforms="--enable-postproc --enable-vp9-postproc --as=yasm" arm_platforms="--enable-runtime-cpu-detect --enable-realtime-only" +aarch64_platforms="--enable-runtime-cpu-detect --enable-realtime-only" +mips64_platforms="--enable-runtime-cpu-detect --cpu=loongson3" +ppc64le_platforms="--enable-runtime-cpu-detect --enable-vsx" +other_arch_platforms="--enable-runtime-cpu-detect" gen_config_files linux/x64 "--target=x86_64-linux-gcc ${all_platforms} ${x86_platforms}" gen_config_files linux/ia32 "--target=x86-linux-gcc ${all_platforms} ${x86_platforms}" gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}" gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}" -gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}" +gen_config_files win/x64 "--target=x86_64-win64-vs17 ${all_platforms} ${x86_platforms}" gen_config_files win/ia32 "--target=x86-win32-gcc ${all_platforms} ${x86_platforms}" gen_config_files linux/arm "--target=armv7-linux-gcc ${all_platforms} ${arm_platforms}" +gen_config_files linux/arm64 "--target=arm64-linux-gcc ${all_platforms} ${aarch64_platforms}" +gen_config_files mac/arm64 "--target=arm64-darwin-gcc ${all_platforms} ${aarch64_platforms}" +gen_config_files linux/mips32 "--target=mips32-linux-gcc ${all_platforms} ${other_arch_platforms}" +gen_config_files linux/mips64 "--target=mips64-linux-gcc ${all_platforms} ${mips64_platforms}" +gen_config_files linux/ppc64le "--target=ppc64le-linux-gcc ${all_platforms} ${ppc64le_platforms}" +gen_config_files linux/loongarch64 "--target=loongarch64-linux-gcc ${all_platforms} ${other_arch_platforms}" gen_config_files generic "--target=generic-gnu ${all_platforms}" @@ -224,6 +234,12 @@ gen_rtcd_header win/x64 x86_64 gen_rtcd_header win/ia32 x86 gen_rtcd_header linux/arm armv7 +gen_rtcd_header linux/arm64 arm64 +gen_rtcd_header mac/arm64 arm64 +gen_rtcd_header linux/mips32 mips32 +gen_rtcd_header linux/mips64 mips64 +gen_rtcd_header linux/ppc64le ppc64le +gen_rtcd_header linux/loongarch64 loongarch64 gen_rtcd_header generic generic @@ -257,6 +273,36 @@ make_clean make libvpx_srcs.txt target=libs $config > /dev/null convert_srcs_to_project_files libvpx_srcs.txt ARM +echo "Generate AARCH64 source list." +config=$(print_config linux/arm64) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt AARCH64 + +echo "Generate MIPS32 source list." +config=$(print_config linux/mips32) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt MIPS32 + +echo "Generate MIPS64 source list." +config=$(print_config linux/mips64) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt MIPS64 + +echo "Generate PPC64LE source list." +config=$(print_config linux/ppc64le) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt PPC64LE + +echo "Generate LOONGARCH64 source list." +config=$(print_config linux/loongarch64) +make_clean +make libvpx_srcs.txt target=libs $config > /dev/null +convert_srcs_to_project_files libvpx_srcs.txt LOONGARCH64 + echo "Generate generic source list." config=$(print_config generic) make_clean diff --git a/media/libvpx/libvpx/.clang-format b/media/libvpx/libvpx/.clang-format index d91cf89aac..a8bc4967c3 100644 --- a/media/libvpx/libvpx/.clang-format +++ b/media/libvpx/libvpx/.clang-format @@ -1,91 +1,9 @@ --- Language: Cpp -# BasedOnStyle: Google -# Generated with clang-format 3.8.1 -AccessModifierOffset: -1 -AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: true -AlignOperands: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false +BasedOnStyle: Google AllowShortCaseLabelsOnASingleLine: true -AllowShortFunctionsOnASingleLine: All -AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true -AlwaysBreakTemplateDeclarations: true -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 Cpp11BracedListStyle: false DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] -IncludeCategories: - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IndentCaseLabels: true -IndentWidth: 2 -IndentWrappedFunctionNames: false -KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Right -ReflowComments: true SortIncludes: false -SpaceAfterCStyleCast: false -SpaceBeforeAssignmentOperators: true -SpaceBeforeParens: ControlStatements -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 2 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Auto -TabWidth: 8 -UseTab: Never -... - diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap index 94cb1ecfe7..2fc6c7d39d 100644 --- a/media/libvpx/libvpx/.mailmap +++ b/media/libvpx/libvpx/.mailmap @@ -1,37 +1,58 @@ Adrian Grange -Aℓex Converse -Aℓex Converse +Aℓex Converse +Aℓex Converse +Aℓex Converse Alexis Ballier Alpha Lam +Angie Chiang +Bohan Li +Chris Cunningham +Chi Yo Tsai Daniele Castagna Deb Mukherjee +Elliott Karpilovsky Erik Niemeyer +Fyodor Kyslov +Gregor Jasny +Gregor Jasny Guillaume Martres Hangyu Kuang Hui Su Jacky Chen Jim Bankoski Johann Koenig +Johann Koenig Johann Koenig -Johann Koenig Johann Koenig +Johann John Koleszar Joshua Litt +Konstantinos Margaritis Marco Paniconi Marco Paniconi +Martin Storsjö +Michael Horowitz Pascal Massimino Paul Wilkins +Peter Boström Peter de Rivaz Peter de Rivaz Ralph Giles Ralph Giles Ronald S. Bultje +Sai Deng Sami Pietilä +Shiyou Yin Tamar Levy Tamar Levy Tero Rintaluoma Timothy B. Terriberry Tom Finegan Tom Finegan +Urvang Joshi +Yaowu Xu Yaowu Xu Yaowu Xu +Venkatarama NG. Avadhani +Vitaly Buka +Xiwei Gu diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS index 87a5e845cf..7d264b36ac 100644 --- a/media/libvpx/libvpx/AUTHORS +++ b/media/libvpx/libvpx/AUTHORS @@ -3,60 +3,102 @@ Aaron Watry Abo Talib Mahfoodh -Adam Xu +Adam B. Goode Adrian Grange -Aℓex Converse Ahmad Sharif +Aidan Welch Aleksey Vasenev Alexander Potapenko Alexander Voronov +Alexandra Hájková +Aℓex Converse +Alex Davicenko Alexis Ballier Alok Ahuja Alpha Lam A.Mahfoodh Ami Fischman Andoni Morales Alastruey +Andres Calderon Jaramillo Andres Mejia +Andrew Lewis Andrew Russell +Andrew Salkeld +Angie Chen Angie Chiang +Anton Venema +Anupam Pandey Aron Rosenberg Attila Nagy +Birk Magnussen +Bohan Li +Brian Foley Brion Vibber +Casey Smalley changjun.yang Charles 'Buck' Krasic +Cheng Chen +Chen Wang +Cherma Rajan A +Chi Yo Tsai chm +Chris Cunningham Christian Duvivier +Chunbo Hua +Chun-Min Chang +Clement Courbet +Daniel Cheng Daniele Castagna Daniel Kang +Daniel Sommermann +Dan Zhu +David Benjamin Deb Mukherjee Deepa K G +Diksha Singh Dim Temp Dmitry Kovalev Dragan Mrdjan Ed Baker Ehsan Akhgari +Elliott Karpilovsky Erik Niemeyer Fabio Pedretti +Feng, Feifei +Florian Mayer Frank Galligan Fredrik Söderquist Fritz Koenig +Fyodor Kyslov Gabriel Marin Gaute Strokkenes +George Steed +Gerda Zsejke More Geza Lore Ghislain MARY Giuseppe Scrivano Gordana Cmiljanovic +Gregor Jasny Guillaume Martres Guillermo Ballester Valor +Hang Nguyen Hangyu Kuang Hanno Böck +Han Shen +Hao Chen +Hari Limaye +Harish Mahendrakar Henrik Lundin +Hien Ho +Hirokazu Honda Hui Su +Ilya Kurdyukov Ivan Krasin Ivan Maltz Jacek Caban Jacky Chen James Berry +James Touton James Yu James Zern Jan Gerber @@ -66,44 +108,71 @@ Jean-Yves Avenard Jeff Faust Jeff Muizelaar Jeff Petkau +Jeremy Dorfman +Jeremy Leconte Jerome Jiang Jia Jia +Jianhui Dai Jian Zhou Jim Bankoski +jinbo +Jin Bo Jingning Han +Joel Fernandes Joey Parrish +Johann Johann Koenig John Koleszar Johnny Klonaris John Stark +Jonathan Wright +Jon Kunkee +Jorge E. Moreira Joshua Bleecher Snyder Joshua Litt Julia Robson Justin Clift Justin Lebar Kaustubh Raste +Kexy Biscuit KO Myung-Hun +Konstantinos Margaritis +Kyle Siefring Lawrence Velázquez +L. E. Segovia Linfeng Zhang +Lin Zheng +Liu Peng Lou Quillio Luca Barbato +Luc Trudeau +Lu Wang Makoto Kato Mans Rullgard Marco Paniconi Mark Mentovai Martin Ettl -Martin Storsjo +Martin Storsjö Matthew Heaney +Matthias Räncker +Michael Horowitz Michael Kohler +Michał Janiszewski Mike Frysinger Mike Hommey Mikhal Shemer +Mikko Koivisto Min Chen Minghai Shang Min Ye +Mirko Bonadei +Moriyoshi Koizumi Morton Jonuschat Nathan E. Egge +Neeraj Gadgil +Neil Birkbeck Nico Weber +Niveditha Rau Parag Salasakar Pascal Massimino Patrik Westin @@ -111,29 +180,47 @@ Paul Wilkins Pavol Rusnak Paweł Hajdan Pengchong Jin -Peter Boström +Peter Boström +Peter Collingbourne Peter de Rivaz +Peter Kasting Philip Jägenstedt +Philippe Antoine +Philipp Hancke Priit Laes Rafael Ávila de Espíndola Rafaël Carré +Rafael de Lucena Valle +Rahul Chaudhry Ralph Giles Ranjit Kumar Tulabandu +Raphael Kubo da Costa +Ravi Chaudhary +Ritu Baldwa Rob Bradford Ronald S. Bultje Rui Ueyama +Sai Deng +Salome Thirot Sami Pietilä +Sam James Sarah Parker Sasi Inguva Scott Graham Scott LaVarnway Sean McGovern Sergey Kolomenkin +Sergey Silkin Sergey Ulanov Shimon Doodkin +Shiyou Yin +Shubham Tandle Shunyao Li +Sreerenj Balachandran Stefan Holmer Suman Sunkara +Supradeep T R +Sylvestre Ledru Taekhyun Kim Takanori MATSUURA Tamar Levy @@ -145,13 +232,26 @@ Timothy B. Terriberry Tom Finegan Tristan Matthews Urvang Joshi +Venkatarama NG. Avadhani Vignesh Venkatasubramanian +Vitaly Buka +Vlad Tsyrklevich +Wan-Teh Chang +Wonkap Jang +Xiahong Bao +Xiwei Gu Yaowu Xu Yi Luo +Yongseok Jeon Yongzhe Wang +yuanhecai +Yue Chen +Yun Liu Yunqing Wang Yury Gitman Zoe Liu +Zoltan Kuscsik +zuxy Google Inc. The Mozilla Foundation The Xiph.Org Foundation diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG index 7e7aec67ac..df77c20235 100644 --- a/media/libvpx/libvpx/CHANGELOG +++ b/media/libvpx/libvpx/CHANGELOG @@ -1,3 +1,493 @@ +2026-01-06 v1.16.0 "Xenonetta Duck" + This release includes Arm SVE2 and Neon optimizations for 12-tap filters, + AVX512 implementations for SAD, support for per-frame and per-spatial-layer + PSNR calculation, and numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + Unit tests require C++17 to build. + + Support for 32-bit iOS targets (armv7, armv7s, and i386) has been removed. + + - Enhancement: + Optimized Arm SVE2 and Neon implementations for 12-tap convolution filters. + Optimized Neon High Bitdepth (HBD) SAD and sad_avg functions. + Added Arm Neon DotProd and I8MM implementations for vpx_convolve12. + Added AVX512 implementations for SAD64 and sad_skip functions. + Added SSSE3 and AVX2 implementations for 12-tap temporal filter prediction. + Added support for per-frame and per-spatial-layer PSNR calculation. + + Adjusted temporal filter strength to improve visual quality and reduce block + artifacts. + + Added support for darwin24 (macOS 15) and darwin25 (macOS 26). + libwebm is upgraded to commit b4f01ea. + + - Bug fixes: + Fix to heap buffer overflow in vp9_deblock, vp9_post_proc_frame, and + vp9_pack_bitstream. + + Fix to integer overflow in vp9_highbd_post_proc, vp9_rc_regulate_q, + tiny_ssim, and vp9_calc_pframe_target_size_one_pass_cbr. + + Fix to use-of-uninitialized-value in vp9_highbd_post_proc, mfqe, and + vp8_datarate_test. + + Fix to out-of-bounds in log_tile_cols_from_picsize_level. + Fix to double free on initialization failure in vpx_codec_enc_init_multi. + Fix to division-by-zero crash in vpxenc with 0 FPS numerator input. + + Fix to various build failures for Arm/SVE2, macOS cross-compilation, and + Xcode 16. + +2025-05-28 v1.15.2 "Wigeon Duck" + This release fixes CVE-2025-5283 (bug webm:413411335), and is ABI compatible + with the previous release. + +2025-01-09 v1.15.1 "Wigeon Duck" + This release bumps up the SO major version and fixes the language about ABI + compatibility in the previous release changelog. + +2024-10-22 v1.15.0 "Wigeon Duck" + This release includes new codec control for key frame filtering, more Neon + optimizations, improvements to RTC encoding and bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + It is strongly recommended to skip this release and upgrade to v1.15.1 since + the shared object was versioned incorrectly, as shown in + https://issues.webmproject.org/issues/384672478. + + Temporal filtering improvement that can be turned on with the new codec + control VP9E_SET_KEY_FRAME_FILTERING, which gives 1+% BD-rate saving with + minimal encoder time increase. + + libwebm is upgraded to libwebm-1.0.0.31-10-g3b63004 + + - Enhancement: + Neon optimization speed up + 1-3% speed up across speed 5 to 10 for RTC + 3% speed up for speed 0 and 1 for VoD in standard bitdepth + 3% and 7% speed up for speed 0 and 1 respectively for VoD in high bitdepth + Scene detection is allowed for all RTC speeds (>=5) + Support profile guided optimizations + + Delta quantization parameters for UV channels for vp8 is supported in RTC + rate control library + + Rate control parameters are reset and maximum QP is enforced on scene + changes in SVC when there is no inter-layer prediction + + - Bug fixes: + Fix to Uninitialized scalar variable in `vp9_rd_pick_inter_mode_sb()` + Fix to Integer-overflow in `resize_multistep` + Fix to Heap-buffer-overflow in `vpx_sad64x64_avx2` + Fix to Crash in `vpx_sad8x8_sse2` + Fix to Assertion in `write_modes` + Support profile guided optimizations + Fix to Integer-overflow in `encode_frame_to_data_rate` + Fix to Integer-overflow in `vp9_svc_check_reset_layer_rc_flag` + Fix to core dump error from /usr/bin/tools/tiny_ssim --help + Fix to use-of-uninitialized-value in `vp9_setup_tpl_stats` + Fix to Undefined-shift in `vp9_cyclic_refresh_setup` + Fix to redundant `&& __GNUC__` preproc check + Fix to valgrind warning in EncodeAPI.OssFuzz69906 + Fix to Index-out-of-bounds in `vp8_rd_pick_inter_mode` + Fix to Integer-overflow in `vp8_pick_frame_size` + Fix to Use-of-uninitialized-value in `vpx_codec_peek_stream_info` + Fix to log clutters with the message "Warning: Desired height too large" + Fix to Integer-overflow in `vp9_svc_adjust_avg_frame_qindex` + + Fix to integer overflows caused by huge target bitrate, frame rate, or + g_timebase numerator or denominator + + Fix to missing license headers + Fix to build failure for Android Armv7 + Fix to integer overflows in image helpers + Fix to Integer-overflow in `vp9_calc_iframe_target_size_one_pass_cbr` + Fix to Heap-buffer-overflow in `vp9_pick_inter_mode` + Fix to Segv in `vp9_multi_thread_tile_init` + Fix to Use-of-uninitialized-value in `vp9_row_mt_sync_mem_dealloc` + Fix to Crash in `mbloop_filter_vertical_edge_c` + Fix to Check failed in CheckUnwind + Fix to Heap-buffer-overflow in `write_modes_b` and `vpx_write` + Fix to Possible signed integer overflow found in `vpx_codec_encode` + Fix to build conflicts between Abseil and libaom/libvpx in Win ARM64 builds + Fix to build failures on aarch64 + Fix to Data race in libvpx ARM NEON + Fix to Heap-buffer-overflow in `scale_plane_1_to_2_phase_0` + Fix to integer overflow in `encode_mb_row` + Fix to Floating-point-exception in `vp8_pick_frame_size` + Fix to Heap-buffer-overflow in `vp9_enc_setup_mi` + Fix to build failure with --target=arm64-win64-vs17 + Fix to heap-buffer-overflow write in `vpx_img_read()` + Fix to C vs armv8-linux-gcc encode mismatches for `y4m_360p_10bit_input` + Fix to Null-dereference READ in `ml_predict_var_rd_partitioning` + Fix to Heap-buffer-overflow in `vpx_scaled_2d_ssse3` + Fix to Crash in `convolve_horiz` + Fix to Ill in `vpx_scaled_2d_ssse3` + Fix to Global-buffer-overflow in `cost_coeffs` + +2024-05-21 v1.14.1 "Venetian Duck" + This release includes enhancements and bug fixes. + + - Upgrading: + This release is ABI compatible with the previous release. + + - Enhancement: + Improved the detection of compiler support for AArch64 extensions, + particularly SVE. + + Added vpx_codec_get_global_headers() support for VP9. + + - Bug fixes: + Added buffer bounds checks to vpx_writer and vpx_write_bit_buffer. + Fix to GetSegmentationData() crash in aq_mode=0 for RTC rate control. + Fix to alloc for row_base_thresh_freq_fac. + Free row mt memory before freeing cpi->tile_data. + Fix to buffer alloc for vp9_bitstream_worker_data. + Fix to VP8 race issue for multi-thread with pnsr_calc. + Fix to uv width/height in vp9_scale_and_extend_frame_ssse3. + Fix to integer division by zero and overflow in calc_pframe_target_size(). + Fix to integer overflow in vpx_img_alloc() & vpx_img_wrap()(CVE-2024-5197). + Fix to UBSan error in vp9_rc_update_framerate(). + Fix to UBSan errors in vp8_new_framerate(). + Fix to integer overflow in vp8 encodeframe.c. + Handle EINTR from sem_wait(). + +2024-01-02 v1.14.0 "Venetian Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). It adds support for run-time CPU feature detection for Arm + platforms, as well as support for darwin23 (macOS 14). + + - Upgrading: + This release is ABI incompatible with the previous release. + + Various new features for rate control library for real-time: SVC parallel + encoding, loopfilter level, support for frame dropping, and screen content. + + New callback function send_tpl_gop_stats for vp9 external rate control + library, which can be used to transmit TPL stats for a group of pictures. A + public header vpx_tpl.h is added for the definition of TPL stats used in + this callback. + + libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c. + + - Enhancement: + Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8, + 68%-151% speed up for high bitdepth. + + Improvements on AVX2 and SSE optimizations. + Improvements on LSX optimizations for LoongArch. + 42-49% speedup on speed 0 VoD encoding. + Android API level predicates. + + - Bug fixes: + Fix to missing prototypes from the rtcd header. + Fix to segfault when total size is enlarged but width is smaller. + Fix to the build for arm64ec using MSVC. + Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic. + Fix to -Wshadow warnings. + Fix to heap overflow in vpx_get4x4sse_cs_neon. + Fix to buffer overrun in highbd Neon subpel variance filters. + Added bitexact encode test script. + Fix to -Wl,-z,defs with Clang's sanitizers. + Fix to decoder stability after error & continued decoding. + Fix to mismatch of VP9 encode with NEON intrinsics with C only version. + Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon. + Fix to fragments count before use. + Fix to a case where target bandwidth is 0 for SVC. + Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob. + Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr. + Fix to integer overflow in vp8,ratectrl.c. + Fix to integer overflow in vp9 svc. + Fix to avg_frame_bandwidth overflow. + Fix to per frame qp for temporal layers. + Fix to unsigned integer overflow in sse computation. + Fix to uninitialized mesh feature for BEST mode. + Fix to overflow in highbd temporal_filter. + Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon. + Skip arm64_neon.h workaround w/VS >= 2019. + Fix to c vs avx mismatch of diamond_search_sad(). + Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function. + Fix to a bug in vpx_hadamard_32x32_neon(). + Fix to Clang -Wunreachable-code-aggressive warnings. + Fix to a bug in vpx_highbd_hadamard_32x32_neon(). + Fix to -Wunreachable-code in mfqe_partition. + Force mode search on 64x64 if no mode is selected. + Fix to ubsan failure caused by left shift of negative. + Fix to integer overflow in calc_pframe_target_size. + Fix to float-cast-overflow in vp8_change_config(). + Fix to a null ptr before use. + Conditionally skip using inter frames in speed features. + Remove invalid reference frames. + Disable intra mode search speed features conditionally. + Set nonrd keyframe under dynamic change of deadline for rtc. + Fix to scaled reference offsets. + Set skip_recode=0 in nonrd_pick_sb_modes. + Fix to an edge case when downsizing to one. + Fix to a bug in frame scaling. + Fix to pred buffer stride. + Fix to a bug in simple motion search. + Update frame size in actual encoding. + +2023-09-29 v1.13.1 "Ugly Duckling" + This release contains two security related fixes. One each for VP8 and VP9. + + - Upgrading: + This release is ABI compatible with the previous release. + + - Bug fixes: + https://crbug.com/1486441 (CVE-2023-5217) + Fix to a crash related to VP9 encoding (#1642, CVE-2023-6349) + +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. + Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. + +2022-06-17 v1.12.0 "Torrent Duck" + This release adds optimizations for Loongarch, adds support for vp8 in the + real-time rate control library, upgrades GoogleTest to v1.11.0, updates + libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes. + + - Upgrading: + This release is ABI compatible with the previous release. + + vp8 support in the real-time rate control library. + New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added. + + Configure support for darwin21 is added. + + GoogleTest is upgraded to v1.11.0. + + libwebm is updated to libwebm-1.0.0.28-20-g206d268. + + Allow SimpleEncode environment to take target level as input to match + the level conformance in vp9. + + - Enhancement: + Numerous improvements on checking memory allocations. + Optimizations for Loongarch. + Code clean-up. + + - Bug fixes: + Fix to a crash related to {vp8/vp9}_set_roi_map. + Fix to compiling failure with -Wformat-nonliteral. + Fix to integer overflow with vp9 with high resolution content. + Fix to AddNoiseTest failure with ARMv7. + Fix to libvpx Null-dereference READ in vp8. + +2021-09-27 v1.11.0 "Smew Duck" + This maintenance release adds support for VBR mode in VP9 rate control + interface, new codec controls to get quantization parameters and loop filter + levels, and includes several improvements to NEON and numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + New codec control is added to get quantization parameters and loop filter + levels. + + VBR mode is supported in VP9 rate control library. + + - Enhancement: + Numerous improvements for Neon optimizations. + Code clean-up and refactoring. + Calculation of rd multiplier is changed with BDRATE gains. + + - Bug fixes: + Fix to overflow on duration. + Fix to several instances of -Wunused-but-set-variable. + Fix to avoid chroma resampling for 420mpeg2 input. + Fix to overflow in calc_iframe_target_size. + Fix to disallow skipping transform and quantization. + Fix some -Wsign-compare warnings in simple_encode. + Fix input file path in simple_encode_test. + Fix valid range for under/over_shoot pct. + +2021-03-09 v1.10.0 "Ruddy Duck" + This maintenance release adds support for darwin20 and new codec controls, as + well as numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + New codec control is added to disable loopfilter for VP9. + + New encoder control is added to disable feature to increase Q on overshoot + detection for CBR. + + Configure support for darwin20 is added. + + New codec control is added for VP9 rate control. The control ID of this + interface is VP9E_SET_EXTERNAL_RATE_CONTROL. To make VP9 use a customized + external rate control model, users will have to implement each callback + function in vpx_rc_funcs_t and register them using libvpx API + vpx_codec_control_() with the control ID. + + - Enhancement: + Use -std=gnu++11 instead of -std=c++11 for c++ files. + + - Bug fixes: + Override assembler with --as option of configure for MSVS. + Fix several compilation issues with gcc 4.8.5. + Fix to resetting rate control for temporal layers. + Fix to the rate control stats of SVC example encoder when number of spatial + layers is 1. + Fix to reusing motion vectors from the base spatial layer in SVC. + 2 pass related flags removed from SVC example encoder. + +2020-07-29 v1.9.0 "Quacking Duck" + This release adds support for NV12, a separate library for rate control, as + well as incremental improvements. + + - Upgrading: + This release is ABI compatible with the previous release. + NV12 support is added to this release. + A new interface is added for VP9 rate control. The new library libvp9rc.a + must be linked by applications. + Googletest is updated to v1.10.0. + simple_encode.cc is compiled into a new library libsimple_encode.a with + CONFIG_RATE_CTRL. + + - Enhancement: + Various changes to improve VP9 SVC, rate control, quality and speed to real + time encoding. + + - Bug fixes: + Fix key frame update refresh simulcast flexible svc. + Fix to disable_16x16part speed feature for real time encoding. + Fix some signed integer overflows for VP9 rate control. + Fix initialization of delta_q_uv. + Fix condition in regulate_q for cyclic refresh. + Various fixes to dynamic resizing for VP9 SVC. + +2019-12-09 v1.8.2 "Pekin Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + This release is ABI compatible with the previous release. + ARCH_* defines have been removed in favor of VPX_ARCH_*. + +2019-07-15 v1.8.1 "Orpington Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + This release is ABI incompatible with the previous release. + VP8E_SET_CPUUSED now accepts values up to 9 for vp9. + VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). + The --sdk-path option has been removed. If you were using it to build for + Android please read build/make/Android.mk for alternatives. + All PPC optimizations have been disabled: + https://bugs.chromium.org/p/webm/issues/detail?id=1522. + + - Enhancements: + Various changes to improve encoder rate control, quality and speed + for practically every use case. + + - Bug fixes: + vp9-rtc: Fix color artifacts for speed >= 8. + +2019-01-31 v1.8.0 "Northern Shoveler Duck" + This release focused on encoding performance for realtime and VOD use cases. + + - Upgrading: + This release is ABI incompatible with the previous release. This adds and + improves several vp9 controls. Most are related to SVC: + VP9E_SET_SVC_FRAME_DROP_LAYER: + - Frame dropping in SVC. + VP9E_SET_SVC_INTER_LAYER_PRED: + - Inter-layer prediction in SVC. + VP9E_SET_SVC_GF_TEMPORAL_REF: + - Enable long term temporal reference in SVC. + VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG: + - Extend and improve this control for better flexibility in setting SVC + pattern dynamically. + VP9E_SET_POSTENCODE_DROP: + - Allow for post-encode frame dropping (applies to non-SVC too). + VP9E_SET_SVC_SPATIAL_LAYER_SYNC: + - Enable spatial layer sync frames. + VP9E_SET_SVC_LAYER_ID: + - Extend api to specify temporal id for each spatial layers. + VP9E_SET_ROI_MAP: + - Extend Region of Interest functionality to VP9. + + - Enhancements: + 2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6, + we see approximately 8% for VBR and 10% for CQ. When using --auto-alt-ref=1, + the gains are approximately 4% for VBR and 5% for CQ. + + For real-time encoding, speed 7 has improved by ~5-10%. Encodes targeted at + screen sharing have improved when the content changes significantly (slide + sharing) or scrolls. There is a new speed 9 setting for mobile devices which + is about 10-20% faster than speed 8. + + - Bug fixes: + VP9 denoiser issue. + VP9 partition issue for 1080p. + VP9 rate control improvments. + Postprocessing Multi Frame Quality Enhancement (MFQE) issue. + VP8 multithread decoder issues. + A variety of fuzzing issues. + +2018-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + 2017-01-09 v1.6.1 "Long Tailed Duck" This release improves upon the VP9 encoder and speeds up the encoding and decoding processes. @@ -272,7 +762,7 @@ of particular interest to real time streaming applications. Temporal scalability allows the encoder to produce a stream that can - be decimated to different frame rates, with independent rate targetting + be decimated to different frame rates, with independent rate targeting for each substream. Multiframe quality enhancement postprocessing can make visual quality diff --git a/media/libvpx/libvpx/CONTRIBUTING.md b/media/libvpx/libvpx/CONTRIBUTING.md new file mode 100644 index 0000000000..7a73a30317 --- /dev/null +++ b/media/libvpx/libvpx/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use a [Gerrit](https://www.gerritcodereview.com) instance hosted at +https://chromium-review.googlesource.com for this purpose. See the +[WebM Project page](https://www.webmproject.org/code/contribute/submitting-patches/) +for additional details. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README index 6d29968865..e47aa20ad4 100644 --- a/media/libvpx/libvpx/README +++ b/media/libvpx/libvpx/README @@ -1,5 +1,3 @@ -README - 9 January 2017 - Welcome to the WebM VP8/VP9 Codec SDK! COMPILING THE APPLICATIONS/LIBRARIES: @@ -9,22 +7,31 @@ COMPILING THE APPLICATIONS/LIBRARIES: 1. Prerequisites - * All x86 targets require the Yasm[1] assembler be installed. - * All Windows builds require that Cygwin[2] be installed. - * Building the documentation requires Doxygen[3]. If you do not + * All x86 targets require the NASM[0] or Yasm[1] assembler be installed[2]. + * All Windows builds require that Cygwin[3] or MSYS2[4] be installed. + * Building the documentation requires Doxygen[5]. If you do not have this package, the install-docs option will be disabled. - * Downloading the data for the unit tests requires curl[4] and sha1sum. + * Downloading the data for the unit tests requires curl[6] and sha1sum. sha1sum is provided via the GNU coreutils, installed by default on many *nix platforms, as well as MinGW and Cygwin. If coreutils is not available, a compatible version of sha1sum can be built from - source[5]. These requirements are optional if not running the unit + source[7]. These requirements are optional if not running the unit tests. + [0]: https://www.nasm.us/ [1]: http://www.tortall.net/projects/yasm - [2]: http://www.cygwin.com - [3]: http://www.doxygen.org - [4]: http://curl.haxx.se - [5]: http://www.microbrew.org/tools/md5sha1sum/ + [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the + PATH for Visual Studio. For VS2017 it is sufficient to rename + yasm--.exe to yasm.exe and place it in: + Program Files (x86)/Microsoft Visual Studio/2017//Common7/Tools/ + The MSYS2 version of the yasm binary can also be used and avoids an + issue caused by a missing Visual C++ Redistributable install (Visual + Studio 2010, MSVCR100.dll). + [3]: http://www.cygwin.com + [4]: http://www.msys2.org/ + [5]: http://www.doxygen.org + [6]: http://curl.haxx.se + [7]: http://www.microbrew.org/tools/md5sha1sum/ 2. Out-of-tree builds Out of tree builds are a supported method of building the application. For @@ -41,7 +48,16 @@ COMPILING THE APPLICATIONS/LIBRARIES: used to get a list of supported options: $ ../libvpx/configure --help - 4. Cross development + 4. Compiler analyzers + Compilers have added sanitizers which instrument binaries with information + about address calculation, memory usage, threading, undefined behavior, and + other common errors. To simplify building libvpx with some of these features + use tools/set_analyzer_env.sh before running configure. It will set the + compiler and necessary flags for building as well as environment variables + read by the analyzer when testing the binaries. + $ source ../libvpx/tools/set_analyzer_env.sh address + + 5. Cross development For cross development, the most notable option is the --target option. The most up-to-date list of supported targets can be found at the bottom of the --help output of the configure script. As of this writing, the list of @@ -49,19 +65,36 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc + arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc + arm64-darwin23-gcc + arm64-darwin24-gcc + arm64-darwin25-gcc arm64-linux-gcc + arm64-win64-gcc + arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct armv7-linux-gcc armv7-none-rvct - armv7-win32-vs11 - armv7-win32-vs12 + armv7-win32-gcc armv7-win32-vs14 + armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 armv7s-darwin-gcc armv8-linux-gcc + loongarch32-linux-gcc + loongarch64-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc @@ -74,16 +107,18 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin13-gcc x86-darwin14-gcc x86-darwin15-gcc + x86-darwin16-gcc + x86-darwin17-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc x86-os2-gcc x86-solaris-gcc x86-win32-gcc - x86-win32-vs10 - x86-win32-vs11 - x86-win32-vs12 x86-win32-vs14 + x86-win32-vs15 + x86-win32-vs16 + x86-win32-vs17 x86_64-android-gcc x86_64-darwin9-gcc x86_64-darwin10-gcc @@ -92,15 +127,25 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin13-gcc x86_64-darwin14-gcc x86_64-darwin15-gcc + x86_64-darwin16-gcc + x86_64-darwin17-gcc + x86_64-darwin18-gcc + x86_64-darwin19-gcc + x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc + x86_64-darwin23-gcc + x86_64-darwin24-gcc + x86_64-darwin25-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc x86_64-solaris-gcc x86_64-win64-gcc - x86_64-win64-vs10 - x86_64-win64-vs11 - x86_64-win64-vs12 x86_64-win64-vs14 + x86_64-win64-vs15 + x86_64-win64-vs16 + x86_64-win64-vs17 generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, @@ -113,10 +158,10 @@ COMPILING THE APPLICATIONS/LIBRARIES: $ CROSS=mipsel-linux-uclibc- ../libvpx/configure In addition, the executables to be invoked can be overridden by specifying the - environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be - passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS. + environment variables: AR, AS, CC, CXX, LD, STRIP. Additional flags can be + passed to these executables with ASFLAGS, CFLAGS, CXXFLAGS, and LDFLAGS. - 5. Configuration errors + 6. Configuration errors If the configuration step fails, the first step is to look in the error log. This defaults to config.log. This should give a good indication of what went wrong. If not, contact us for support. @@ -144,7 +189,49 @@ CODE STYLE: See also: http://clang.llvm.org/docs/ClangFormat.html +PROFILE GUIDED OPTIMIZATION (PGO) + Profile Guided Optimization can be enabled for Clang builds using the + commands: + + $ export CC=clang + $ export CXX=clang++ + $ ../libvpx/configure --enable-profile + $ make + + Generate one or multiple PGO profile files by running vpxdec or vpxenc. For + example: + + $ ./vpxdec ../vpx/out_ful/vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm \ + -o - > /dev/null + + To convert and merge the raw profile files, use the llvm-profdata tool: + + $ llvm-profdata merge -o perf.profdata default_8382761441159425451_0.profraw + + Then, rebuild the project with the new profile file: + + $ make clean + $ ../libvpx/configure --use-profile=perf.profdata + $ make + + Note: Always use the llvm-profdata from the toolchain that is used for + compiling the PGO-enabled binary. + + To observe the improvements from a PGO-enabled build, enable and compare the + list of failed optimizations by using the -Rpass-missed compiler flag. For + example, to list the failed loop vectorizations: + + $ ../libvpx/configure --use-profile=perf.profdata \ + --extra-cflags=-Rpass-missed=loop-vectorize + + For guidance on utilizing PGO files to identify potential optimization + opportunities, see: tools/README.pgo.md + SUPPORT This library is an open source project supported by its community. Please email webm-discuss@webmproject.org for help. +BUG REPORTS + Bug reports can be filed in the libvpx issue tracker: + https://issues.webmproject.org/. + For security reports, select 'Security report' from the Template dropdown. diff --git a/media/libvpx/libvpx/args.c b/media/libvpx/libvpx/args.c index a87b138b9d..f7dfacf8a4 100644 --- a/media/libvpx/libvpx/args.c +++ b/media/libvpx/libvpx/args.c @@ -8,21 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include #include "args.h" #include "vpx/vpx_integer.h" -#include "vpx_ports/msvc.h" -#if defined(__GNUC__) && __GNUC__ -extern void die(const char *fmt, ...) __attribute__((noreturn)); +#if defined(__GNUC__) +__attribute__((noreturn)) extern void die(const char *fmt, ...); +#elif defined(_MSC_VER) +__declspec(noreturn) extern void die(const char *fmt, ...); #else extern void die(const char *fmt, ...); #endif -struct arg arg_init(char **argv) { +static struct arg arg_init(char **argv) { struct arg a; a.argv = argv; @@ -81,6 +83,7 @@ const char *arg_next(struct arg *arg) { char **argv_dup(int argc, const char **argv) { char **new_argv = malloc((argc + 1) * sizeof(*argv)); + if (!new_argv) return NULL; memcpy(new_argv, argv, argc * sizeof(*argv)); new_argv[argc] = NULL; @@ -88,24 +91,31 @@ char **argv_dup(int argc, const char **argv) { } void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { - char option_text[40] = { 0 }; - for (; *defs; defs++) { const struct arg_def *def = *defs; char *short_val = def->has_val ? " " : ""; char *long_val = def->has_val ? "=" : ""; + int n = 0; + // Short options are indented with two spaces. Long options are indented + // with 12 spaces. if (def->short_name && def->long_name) { char *comma = def->has_val ? "," : ", "; - snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, - comma, def->long_name, long_val); + n = fprintf(fp, " -%s%s%s --%s%s", def->short_name, short_val, comma, + def->long_name, long_val); } else if (def->short_name) - snprintf(option_text, 37, "-%s%s", def->short_name, short_val); + n = fprintf(fp, " -%s%s", def->short_name, short_val); else if (def->long_name) - snprintf(option_text, 37, " --%s%s", def->long_name, long_val); + n = fprintf(fp, " --%s%s", def->long_name, long_val); - fprintf(fp, " %-37s\t%s\n", option_text, def->desc); + // Descriptions are indented with 40 spaces. If an option is 40 characters + // or longer, its description starts on the next line. + if (n < 40) + for (int i = 0; i < 40 - n; i++) fputc(' ', fp); + else + fputs("\n ", fp); + fprintf(fp, "%s\n", def->desc); if (def->enums) { const struct arg_enum_list *listptr; @@ -132,7 +142,6 @@ unsigned int arg_parse_uint(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } int arg_parse_int(const struct arg *arg) { @@ -149,7 +158,6 @@ int arg_parse_int(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } struct vpx_rational { @@ -206,7 +214,6 @@ int arg_parse_enum(const struct arg *arg) { if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); - return 0; } int arg_parse_enum_or_int(const struct arg *arg) { diff --git a/media/libvpx/libvpx/args.h b/media/libvpx/libvpx/args.h index 54abe04607..6b3cc1c28b 100644 --- a/media/libvpx/libvpx/args.h +++ b/media/libvpx/libvpx/args.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef ARGS_H_ -#define ARGS_H_ +#ifndef VPX_ARGS_H_ +#define VPX_ARGS_H_ #include #ifdef __cplusplus @@ -28,8 +28,7 @@ struct arg_enum_list { const char *name; int val; }; -#define ARG_ENUM_LIST_END \ - { 0 } +#define ARG_ENUM_LIST_END { 0 } typedef struct arg_def { const char *short_name; @@ -38,19 +37,16 @@ typedef struct arg_def { const char *desc; const struct arg_enum_list *enums; } arg_def_t; -#define ARG_DEF(s, l, v, d) \ - { s, l, v, d, NULL } -#define ARG_DEF_ENUM(s, l, v, d, e) \ - { s, l, v, d, e } -#define ARG_DEF_LIST_END \ - { 0 } +#define ARG_DEF(s, l, v, d) { s, l, v, d, NULL } +#define ARG_DEF_ENUM(s, l, v, d, e) { s, l, v, d, e } +#define ARG_DEF_LIST_END { 0 } -struct arg arg_init(char **argv); int arg_match(struct arg *arg_, const struct arg_def *def, char **argv); const char *arg_next(struct arg *arg); void arg_show_usage(FILE *fp, const struct arg_def *const *defs); char **argv_dup(int argc, const char **argv); +// Note: arg_match() must be called before invoking these functions. unsigned int arg_parse_uint(const struct arg *arg); int arg_parse_int(const struct arg *arg); struct vpx_rational arg_parse_rational(const struct arg *arg); @@ -60,4 +56,4 @@ int arg_parse_enum_or_int(const struct arg *arg); } // extern "C" #endif -#endif // ARGS_H_ +#endif // VPX_ARGS_H_ diff --git a/media/libvpx/libvpx/build/make/Android.mk b/media/libvpx/libvpx/build/make/Android.mk index a88f90056e..3663c38305 100644 --- a/media/libvpx/libvpx/build/make/Android.mk +++ b/media/libvpx/libvpx/build/make/Android.mk @@ -8,17 +8,15 @@ ## be found in the AUTHORS file in the root of the source tree. ## +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT # # This file is to be used for compiling libvpx for Android using the NDK. # In an Android project place a libvpx checkout in the jni directory. # Run the configure script from the jni directory. Base libvpx # encoder/decoder configuration will look similar to: -# ./libvpx/configure --target=armv7-android-gcc --disable-examples \ -# --sdk-path=/opt/android-ndk-r6b/ -# -# When targeting Android, realtime-only is enabled by default. This can -# be overridden by adding the command line flag: -# --disable-realtime-only +# ./libvpx/configure --target=arm64-android-gcc --disable-examples \ +# --enable-external-build # # This will create .mk files that contain variables that contain the # source files to compile. @@ -27,39 +25,25 @@ # Android.mk file in the libvpx directory: # LOCAL_PATH := $(call my-dir) # include $(CLEAR_VARS) -# include jni/libvpx/build/make/Android.mk +# include libvpx/build/make/Android.mk # -# By default libvpx will detect at runtime the existance of NEON extension. -# For this we import the 'cpufeatures' module from the NDK sources. -# libvpx can also be configured without this runtime detection method. -# Configuring with --disable-runtime-cpu-detect will assume presence of NEON. -# Configuring with --disable-runtime-cpu-detect --disable-neon \ -# --disable-neon-asm -# will remove any NEON dependency. +# By default libvpx will use the 'cpufeatures' module from the NDK. This allows +# the library to be built with all available optimizations (SSE2->AVX512 for +# x86, NEON for arm, DSPr2 for mips). This can be disabled with +# --disable-runtime-cpu-detect +# but the resulting library *must* be run on devices supporting all of the +# enabled extensions. They can be disabled individually with +# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} +# --disable-neon{, -asm, -neon-dotprod, -neon-i8mm} +# --disable-sve +# --disable-{dspr2, msa} # -# Running ndk-build will build libvpx and include it in your project. +# Running ndk-build will build libvpx and include it in your project. Set +# APP_ABI to match the --target passed to configure: +# https://developer.android.com/ndk/guides/application_mk#app_abi. # -# Alternatively, building the examples and unit tests can be accomplished in the -# following way: -# -# Create a standalone toolchain from the NDK: -# https://developer.android.com/ndk/guides/standalone_toolchain.html -# -# For example - to test on arm64 devices with clang: -# $NDK/build/tools/make_standalone_toolchain.py \ -# --arch arm64 --install-dir=/tmp/my-android-toolchain -# export PATH=/tmp/my-android-toolchain/bin:$PATH -# CROSS=aarch64-linux-android- CC=clang CXX=clang++ /path/to/libvpx/configure \ -# --target=arm64-android-gcc -# -# Push the resulting binaries to a device and run them: -# adb push test_libvpx /data/tmp/test_libvpx -# adb shell /data/tmp/test_libvpx --gtest_filter=\*Sixtap\* -# -# Make sure to push the test data as well and set LIBVPX_TEST_DATA - CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas @@ -183,6 +167,9 @@ LOCAL_CFLAGS += \ -I$(ASM_CNV_PATH)/libvpx LOCAL_MODULE := libvpx +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) LOCAL_STATIC_LIBRARIES := cpufeatures @@ -226,3 +213,4 @@ endif ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes) $(call import-module,android/cpufeatures) endif +endif # NDK_ROOT diff --git a/media/libvpx/libvpx/build/make/Makefile b/media/libvpx/libvpx/build/make/Makefile index cba605786c..39dff825b6 100644 --- a/media/libvpx/libvpx/build/make/Makefile +++ b/media/libvpx/libvpx/build/make/Makefile @@ -21,9 +21,9 @@ all: .DEFAULT clean:: .DEFAULT exampletest: .DEFAULT install:: .DEFAULT -test:: .DEFAULT -test-no-data-check:: .DEFAULT -testdata:: .DEFAULT +test: .DEFAULT +test-no-data-check: .DEFAULT +testdata: .DEFAULT utiltest: .DEFAULT exampletest-no-data-check utiltest-no-data-check: .DEFAULT test_%: .DEFAULT ; @@ -99,6 +99,7 @@ distclean: clean rm -f Makefile; \ rm -f config.log config.mk; \ rm -f vpx_config.[hc] vpx_config.asm; \ + rm -f arm_neon.h; \ else \ rm -f $(target)-$(TOOLCHAIN).mk; \ fi @@ -110,13 +111,13 @@ exampletest: .PHONY: install install:: .PHONY: test -test:: +test: .PHONY: testdata -testdata:: +testdata: .PHONY: utiltest utiltest: .PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check -test-no-data-check:: +test-no-data-check: exampletest-no-data-check utiltest-no-data-check: # Force to realign stack always on OS/2 @@ -124,6 +125,7 @@ ifeq ($(TOOLCHAIN), x86-os2-gcc) CFLAGS += -mstackrealign endif +# x86[_64] $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx $(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 @@ -138,6 +140,32 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl + +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve2.c.d: CFLAGS += -march=armv9-a+i8mm+sve2 +$(BUILD_PFX)%_sve2.c.o: CFLAGS += -march=armv9-a+i8mm+sve2 + +# POWER +$(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx +$(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx + +# MIPS +$(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa +$(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa + +# LOONGARCH +$(BUILD_PFX)%_lsx.c.d: CFLAGS += -mlsx +$(BUILD_PFX)%_lsx.c.o: CFLAGS += -mlsx +$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx +$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") @@ -286,6 +314,19 @@ $(1): $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ endef +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. +NO_UNDEFINED := -Wl,-z,defs +ifeq ($(findstring clang,$(CC)),clang) + ifneq ($(filter -fsanitize=%,$(LDFLAGS)),) + NO_UNDEFINED := + endif +endif + define so_template # Not using a pattern rule here because we don't want to generate empty # archives when they are listed as a dependency in files not responsible @@ -295,7 +336,8 @@ define so_template $(1): $(if $(quiet),@echo " [LD] $$@") $(qexec)$$(LD) -shared $$(LDFLAGS) \ - -Wl,--no-undefined -Wl,-soname,$$(SONAME) \ + $(NO_UNDEFINED) \ + -Wl,-soname,$$(SONAME) \ -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \ $$(filter %.o,$$^) $$(extralibs) endef @@ -422,10 +464,10 @@ ifneq ($(call enabled,DIST-SRCS),) DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_vcxproj.sh DIST-SRCS-$(CONFIG_MSVS) += build/make/msvs_common.sh DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh - DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas.pl - DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas_apple.pl - DIST-SRCS-$(ARCH_ARM) += build/make/ads2armasm_ms.pl - DIST-SRCS-$(ARCH_ARM) += build/make/thumb.pm + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2gas_apple.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/ads2armasm_ms.pl + DIST-SRCS-$(VPX_ARCH_ARM) += build/make/thumb.pm DIST-SRCS-yes += $(target:-$(TOOLCHAIN)=).mk endif INSTALL-SRCS := $(call cond_enabled,CONFIG_INSTALL_SRCS,INSTALL-SRCS) @@ -447,6 +489,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins all: $(BUILD_TARGETS) install:: $(INSTALL_TARGETS) dist: $(INSTALL_TARGETS) -test:: +test: .SUFFIXES: # Delete default suffix rules diff --git a/media/libvpx/libvpx/build/make/ads2armasm_ms.pl b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl index 2a2c470ff8..dd4e0318c4 100644 --- a/media/libvpx/libvpx/build/make/ads2armasm_ms.pl +++ b/media/libvpx/libvpx/build/make/ads2armasm_ms.pl @@ -28,7 +28,7 @@ while () s/qsubaddx/qsax/i; s/qaddsubx/qasx/i; - thumb::FixThumbInstructions($_, 1); + thumb::FixThumbInstructions($_); s/ldrneb/ldrbne/i; s/ldrneh/ldrhne/i; diff --git a/media/libvpx/libvpx/build/make/ads2gas.pl b/media/libvpx/libvpx/build/make/ads2gas.pl index 029cc4a56f..c301b7f829 100644 --- a/media/libvpx/libvpx/build/make/ads2gas.pl +++ b/media/libvpx/libvpx/build/make/ads2gas.pl @@ -23,16 +23,17 @@ use lib $FindBin::Bin; use thumb; my $thumb = 0; +my $elf = 1; foreach my $arg (@ARGV) { $thumb = 1 if ($arg eq "-thumb"); + $elf = 0 if ($arg eq "-noelf"); } print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; -print "\t.equ DO1STROUNDING, 0\n"; +print ".syntax unified\n"; if ($thumb) { - print "\t.syntax unified\n"; print "\t.thumb\n"; } @@ -41,39 +42,11 @@ if ($thumb) { while () { - undef $comment; - undef $line; - $comment_char = ";"; - $comment_sub = "@"; - - # Handle comments. - if (/$comment_char/) - { - $comment = ""; - ($line, $comment) = /(.*?)$comment_char(.*)/; - $_ = $line; - } - # Load and store alignment s/@/,:/g; - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + # Comment character + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -81,128 +54,83 @@ while () # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert endfunc to nothing. - s/\bendfunc\b//ig; - - # Convert FUNCTION to nothing. - s/\bFUNCTION\b//g; - s/\bfunction\b//g; - - s/\bENTRY\b//g; - s/\bMSARMASM\b/0/g; - s/^\s+end\s+$//g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { + if (s/\bIF\b/.if/g) { s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; + # Make function visible to linker. + if ($elf) { + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/; + } else { + s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/; + } - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; - - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; - - # Make function visible to linker, and make additional symbol with - # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; - s/IMPORT\s+\|([\$\w]*)\|/.global $1/; - - s/EXPORT\s+([\$\w]*)/.global $1/; - s/export\s+([\$\w]*)/.global $1/; - - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; # ALIGN directive s/\bALIGN\b/.balign/g; if ($thumb) { - # ARM code - we force everything to thumb with the declaration in the header - s/\sARM//g; + # ARM code - we force everything to thumb with the declaration in the + # header + s/\bARM\b//g; } else { # ARM code - s/\sARM/.arm/g; + s/\bARM\b/.arm/g; } # push/pop s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g; s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g; - # NEON code - s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g; - s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g; - if ($thumb) { - thumb::FixThumbInstructions($_, 0); + thumb::FixThumbInstructions($_); } # eabi_attributes numerical equivalents can be found in the # "ARM IHI 0045C" document. - # REQUIRE8 Stack is required to be 8-byte aligned - s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + if ($elf) { + # REQUIRE8 Stack is required to be 8-byte aligned + s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; - # PRESERVE8 Stack 8-byte align is preserved - s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + # PRESERVE8 Stack 8-byte align is preserved + s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + } else { + s/\bREQUIRE8\b//; + s/\bPRESERVE8\b//; + } # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. - if (/\bPROC\b/) - { + if (/\bPROC\b/) { my $proc; - /^_([\.0-9A-Z_a-z]\w+)\b/; + # Match the function name so it can be stored in $proc + /^([\.0-9A-Z_a-z]\w+)\b/; $proc = $1; push(@proc_stack, $proc) if ($proc); s/\bPROC\b/@ $&/; } - if (/\bENDP\b/) - { + + if (/\bENDP\b/) { my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); - $_ = "\t.size $proc, .-$proc".$_ if ($proc); + $_ = ".size $proc, .-$proc".$_ if ($proc and $elf); } # EQU directive @@ -210,19 +138,20 @@ while () # Begin macro definition if (/\bMACRO\b/) { + # Process next line down, which will be the macro definition $_ = ; s/^/.macro/; - s/\$//g; # remove formal param reference - s/;/@/g; # change comment characters + s/\$//g; # Remove $ from the variables in the declaration } - # For macros, use \ to reference formal params - s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition + + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; + s/[ \t]+$//; print; - print "$comment_sub$comment\n" if defined $comment; } # Mark that this object doesn't need an executable stack. -printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n"); +printf (" .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/media/libvpx/libvpx/build/make/ads2gas_apple.pl b/media/libvpx/libvpx/build/make/ads2gas_apple.pl index e1ae7b4f87..62491c1918 100644 --- a/media/libvpx/libvpx/build/make/ads2gas_apple.pl +++ b/media/libvpx/libvpx/build/make/ads2gas_apple.pl @@ -20,19 +20,14 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas_apple.pl script.\n\n"; -print "\t.set WIDE_REFERENCE, 0\n"; -print "\t.set ARCHITECTURE, 5\n"; -print "\t.set DO1STROUNDING, 0\n"; +print ".syntax unified\n"; -my %register_aliases; my %macro_aliases; my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9"); my @incoming_array; -my @imported_functions; - # Perl trim function to remove whitespace from the start and end of the string sub trim($) { @@ -48,25 +43,7 @@ while () s/@/,:/g; # Comment character - s/;/ @/g; - - # Hexadecimal constants prefaced by 0x - s/#&/#0x/g; - - # Convert :OR: to | - s/:OR:/ | /g; - - # Convert :AND: to & - s/:AND:/ & /g; - - # Convert :NOT: to ~ - s/:NOT:/ ~ /g; - - # Convert :SHL: to << - s/:SHL:/ << /g; - - # Convert :SHR: to >> - s/:SHR:/ >> /g; + s/;/@/; # Convert ELSE to .else s/\bELSE\b/.else/g; @@ -74,131 +51,64 @@ while () # Convert ENDIF to .endif s/\bENDIF\b/.endif/g; - # Convert ELSEIF to .elseif - s/\bELSEIF\b/.elseif/g; - - # Convert LTORG to .ltorg - s/\bLTORG\b/.ltorg/g; - - # Convert IF :DEF:to .if - # gcc doesn't have the ability to do a conditional - # if defined variable that is set by IF :DEF: on - # armasm, so convert it to a normal .if and then - # make sure to define a value elesewhere - if (s/\bIF :DEF:\b/.if /g) - { - s/=/==/g; - } - # Convert IF to .if - if (s/\bIF\b/.if/g) - { - s/=/==/g; + if (s/\bIF\b/.if/g) { + s/=+/==/g; } # Convert INCLUDE to .INCLUDE "file" - s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/; - - # Code directive (ARM vs Thumb) - s/CODE([0-9][0-9])/.code $1/; + s/INCLUDE\s?(.*)$/.include \"$1\"/; # No AREA required # But ALIGNs in AREA must be obeyed - s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/; + s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/; # If no ALIGN, strip the AREA and align to 4 bytes - s/^\s*AREA.*$/.text\n.p2align 2/; + s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/; - # DCD to .word - # This one is for incoming symbols - s/DCD\s+\|(\w*)\|/.long $1/; + # Make function visible to linker. + s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/; - # DCW to .short - s/DCW\s+\|(\w*)\|/.short $1/; - s/DCW(.*)/.short $1/; + # No vertical bars on function names + s/^\|(\$?\w+)\|/$1/g; - # Constants defined in scope - s/DCD(.*)/.long $1/; - s/DCB(.*)/.byte $1/; + # Labels and functions need a leading underscore and trailing colon + s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/; - # Make function visible to linker, and make additional symbol with - # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/; - - # Prepend imported functions with _ - if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/) - { - $function = trim($1); - push(@imported_functions, $function); - } - - foreach $function (@imported_functions) - { - s/$function/_$function/; - } - - # No vertical bars required; make additional symbol with prepended - # underscore - s/^\|(\$?\w+)\|/_$1\n\t$1:/g; - - # Labels need trailing colon -# s/^(\w+)/$1:/ if !/EQU/; - # put the colon at the end of the line in the macro - s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/; + # Branches need to call the correct, underscored, function + s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/; # ALIGN directive s/\bALIGN\b/.balign/g; # Strip ARM - s/\sARM/@ ARM/g; + s/\s+ARM//; # Strip REQUIRE8 - #s/\sREQUIRE8/@ REQUIRE8/g; - s/\sREQUIRE8/@ /g; + s/\s+REQUIRE8//; # Strip PRESERVE8 - s/\sPRESERVE8/@ PRESERVE8/g; + s/\s+PRESERVE8//; # Strip PROC and ENDPROC - s/\bPROC\b/@/g; - s/\bENDP\b/@/g; + s/\bPROC\b//g; + s/\bENDP\b//g; # EQU directive - s/(.*)EQU(.*)/.set $1, $2/; + s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/; # Begin macro definition - if (/\bMACRO\b/) - { + if (/\bMACRO\b/) { # Process next line down, which will be the macro definition $_ = ; - - $trimmed = trim($_); - - # remove commas that are separating list - $trimmed =~ s/,//g; - - # string to array - @incoming_array = split(/\s+/, $trimmed); - - print ".macro @incoming_array[0]\n"; - - # remove the first element, as that is the name of the macro - shift (@incoming_array); - - @macro_aliases{@incoming_array} = @mapping_list; - - next; + s/^/.macro/; + s/\$//g; # Remove $ from the variables in the declaration } - while (($key, $value) = each(%macro_aliases)) - { - $key =~ s/\$/\\\$/; - s/$key\b/$value/g; - } + s/\$/\\/g; # Use \ to reference formal parameters + # End macro definition - # For macros, use \ to reference formal params -# s/\$/\\/g; # End macro definition - s/\bMEND\b/.endm/; # No need to tell it where to stop assembling + s/\bMEND\b/.endm/; # No need to tell it where to stop assembling next if /^\s*END\s*$/; - + s/[ \t]+$//; print; } diff --git a/media/libvpx/libvpx/build/make/configure.sh b/media/libvpx/libvpx/build/make/configure.sh index 007e020002..21407f3d89 100644 --- a/media/libvpx/libvpx/build/make/configure.sh +++ b/media/libvpx/libvpx/build/make/configure.sh @@ -74,6 +74,8 @@ Build options: --cpu=CPU optimize for a specific cpu rather than a family --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS] --extra-cxxflags=ECXXFLAGS add ECXXFLAGS to CXXFLAGS [$CXXFLAGS] + --use-profile=PROFILE_FILE + Use PROFILE_FILE for PGO ${toggle_extra_warnings} emit harmless warnings (always non-fatal) ${toggle_werror} treat warnings as errors, if possible (not available with all compilers) @@ -81,6 +83,7 @@ Build options: ${toggle_pic} turn on/off Position Independent Code ${toggle_ccache} turn on/off compiler cache ${toggle_debug} enable/disable debug mode + ${toggle_profile} enable/disable profiling ${toggle_gprof} enable/disable gprof profiling instrumentation ${toggle_gcov} enable/disable gcov coverage instrumentation ${toggle_thumb} enable/disable building arm assembly in thumb mode @@ -262,6 +265,9 @@ if [ -z "$source_path" ] || [ "$source_path" = "." ]; then source_path="`pwd`" disable_feature source_path_used fi +# Makefiles greedily process the '#' character as a comment, even if it is +# inside quotes. So, this character must be escaped in all paths in Makefiles. +source_path_mk=$(echo $source_path | sed -e 's;\#;\\\#;g') if test ! -z "$TMPDIR" ; then TMPDIRx="${TMPDIR}" @@ -319,6 +325,12 @@ check_ld() { && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs} } +check_lib() { + log check_lib "$@" + check_cc $@ \ + && check_cmd ${LD} ${LDFLAGS} -o ${TMP_X} ${TMP_O} "$@" ${extralibs} +} + check_header(){ log check_header "$@" header=$1 @@ -403,6 +415,90 @@ check_gcc_machine_option() { fi } +# tests for -m$2, -m$3, -m$4... toggling the feature given in $1. +check_gcc_machine_options() { + feature="$1" + shift + flags="-m$1" + shift + for opt in $*; do + flags="$flags -m$opt" + done + + if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature " + else + soft_enable "$feature" + fi +} + +check_neon_sve_bridge_compiles() { + if enabled sve; then + check_cc -march=armv8.2-a+dotprod+i8mm+sve < +#include +EOF + compile_result=$? + if [ ${compile_result} -eq 0 ]; then + # Check whether the compiler can compile SVE functions that require + # backup/restore of SVE registers according to AAPCS. Clang for Windows + # used to fail this, see + # https://github.com/llvm/llvm-project/issues/80009. + check_cc -march=armv8.2-a+dotprod+i8mm+sve < +void other(void); +svfloat32_t func(svfloat32_t a) { + other(); + return a; +} +EOF + compile_result=$? + fi + + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling sve: arm_neon_sve_bridge.h not supported by compiler" + log_echo " disabling sve2: arm_neon_sve_bridge.h not supported by compiler" + disable_feature sve + disable_feature sve2 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sve --disable-sve2 " + fi + fi +} + +check_gcc_avx512_compiles() { + if disabled gcc; then + return + fi + + check_cc -mavx512f < +void f(void) { + __m512i x = _mm512_set1_epi16(0); + (void)x; +} +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling avx512: not supported by compiler" + disable_feature avx512 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + fi +} + +check_inline_asm() { + log check_inline_asm "$@" + name="$1" + code="$2" + shift 2 + disable_feature $name + check_cc "$@" <> config.mk @@ -438,11 +534,11 @@ write_common_target_config_mk() { cat >> $1 << EOF # This file automatically generated by configure. Do not edit! -SRC_PATH="$source_path" -SRC_PATH_BARE=$source_path +SRC_PATH="$source_path_mk" +SRC_PATH_BARE=$source_path_mk BUILD_PFX=${BUILD_PFX} TOOLCHAIN=${toolchain} -ASM_CONVERSION=${asm_conversion_cmd:-${source_path}/build/make/ads2gas.pl} +ASM_CONVERSION=${asm_conversion_cmd:-${source_path_mk}/build/make/ads2gas.pl} GEN_VCPROJ=${gen_vcproj_cmd} MSVS_ARCH_DIR=${msvs_arch_dir} @@ -452,7 +548,6 @@ AR=${AR} LD=${LD} AS=${AS} STRIP=${STRIP} -NM=${NM} CFLAGS = ${CFLAGS} CXXFLAGS = ${CXXFLAGS} @@ -464,6 +559,8 @@ AS_SFX = ${AS_SFX:-.asm} EXE_SFX = ${EXE_SFX} VCPROJ_SFX = ${VCPROJ_SFX} RTCD_OPTIONS = ${RTCD_OPTIONS} +LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS} +LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS} EOF if enabled rvct; then cat >> $1 << EOF @@ -474,10 +571,10 @@ fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;' EOF fi - print_config_mk ARCH "${1}" ${ARCH_LIST} - print_config_mk HAVE "${1}" ${HAVE_LIST} - print_config_mk CONFIG "${1}" ${CONFIG_LIST} - print_config_mk HAVE "${1}" gnu_strip + print_config_mk VPX_ARCH "${1}" ${ARCH_LIST} + print_config_mk HAVE "${1}" ${HAVE_LIST} + print_config_mk CONFIG "${1}" ${CONFIG_LIST} + print_config_mk HAVE "${1}" gnu_strip enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}" @@ -494,15 +591,33 @@ write_common_target_config_h() { #define RESTRICT ${RESTRICT} #define INLINE ${INLINE} EOF - print_config_h ARCH "${TMP_H}" ${ARCH_LIST} - print_config_h HAVE "${TMP_H}" ${HAVE_LIST} - print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST} - print_config_vars_h "${TMP_H}" ${VAR_LIST} + print_config_h VPX_ARCH "${TMP_H}" ${ARCH_LIST} + print_config_h HAVE "${TMP_H}" ${HAVE_LIST} + print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST} + print_config_vars_h "${TMP_H}" ${VAR_LIST} echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H} mkdir -p `dirname "$1"` cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" } +write_win_arm64_neon_h_workaround() { + print_webm_license ${TMP_H} "/*" " */" + cat >> ${TMP_H} << EOF +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_WIN_ARM_NEON_H_WORKAROUND +#define VPX_WIN_ARM_NEON_H_WORKAROUND +/* The Windows SDK has arm_neon.h, but unlike on other platforms it is + * ARM32-only. ARM64 NEON support is provided by arm64_neon.h, a proper + * superset of arm_neon.h. Work around this by providing a more local + * arm_neon.h that simply #includes arm64_neon.h. + */ +#include +#endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */ +EOF + mkdir -p `dirname "$1"` + cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" +} + process_common_cmdline() { for opt in "$@"; do optval="${opt#*=}" @@ -534,6 +649,9 @@ process_common_cmdline() { --extra-cxxflags=*) extra_cxxflags="${optval}" ;; + --use-profile=*) + pgo_file=${optval} + ;; --enable-?*|--disable-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` if is_in ${option} ${ARCH_EXT_LIST}; then @@ -585,11 +703,7 @@ process_common_cmdline() { --libdir=*) libdir="${optval}" ;; - --sdk-path=*) - [ -d "${optval}" ] || die "Not a directory: ${optval}" - sdk_path="${optval}" - ;; - --libc|--as|--prefix|--libdir|--sdk-path) + --libc|--as|--prefix|--libdir) die "Option ${opt} requires argument" ;; --help|-h) @@ -634,7 +748,6 @@ setup_gnu_toolchain() { LD=${LD:-${CROSS}${link_with_cc:-ld}} AS=${AS:-${CROSS}as} STRIP=${STRIP:-${CROSS}strip} - NM=${NM:-${CROSS}nm} AS_SFX=.S EXE_SFX= } @@ -674,7 +787,6 @@ check_xcode_minimum_version() { process_common_toolchain() { if [ -z "$toolchain" ]; then gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}" - # detect tgt_isa case "$gcctarget" in aarch64*) @@ -697,37 +809,39 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; + power*64le*-*) + tgt_isa=ppc64le + ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; + loongarch32*) + tgt_isa=loongarch32 + ;; + loongarch64*) + tgt_isa=loongarch64 + ;; esac # detect tgt_os case "$gcctarget" in - *darwin10*) + *darwin1[0-9]*) tgt_isa=x86_64 - tgt_os=darwin10 + tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'` ;; - *darwin11*) - tgt_isa=x86_64 - tgt_os=darwin11 - ;; - *darwin12*) - tgt_isa=x86_64 - tgt_os=darwin12 - ;; - *darwin13*) - tgt_isa=x86_64 - tgt_os=darwin13 - ;; - *darwin14*) - tgt_isa=x86_64 - tgt_os=darwin14 - ;; - *darwin15*) - tgt_isa=x86_64 - tgt_os=darwin15 + *darwin2[0-5]*) + tgt_isa=`uname -m` + tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'` ;; x86_64*mingw32*) tgt_os=win64 ;; + x86_64*cygwin*) + tgt_os=win64 + ;; *mingw32*|*cygwin*) [ -z "$tgt_isa" ] && tgt_isa=x86 tgt_os=win32 @@ -769,16 +883,34 @@ process_common_toolchain() { # Enable the architecture family case ${tgt_isa} in + arm64 | armv8) + enable_feature arm + enable_feature aarch64 + ;; arm*) enable_feature arm ;; mips*) enable_feature mips ;; + ppc*) + enable_feature ppc + ;; + loongarch*) + soft_enable lsx + soft_enable lasx + enable_feature loongarch + ;; esac - # PIC is probably what we want when building shared libs + # Position independent code (PIC) is probably what we want when building + # shared libs or position independent executable (PIE) targets. enabled shared && soft_enable pic + check_cpp << EOF || soft_enable pic +#if !(__pie__ || __PIE__) +#error Neither __pie__ or __PIE__ are set +#endif +EOF # Minimum iOS version for all target platforms (darwin and iphonesimulator). # Shared library framework builds are only possible on iOS 8 and later. @@ -787,13 +919,13 @@ process_common_toolchain() { IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi # Handle darwin variants. Newer SDKs allow targeting older # platforms, so use the newest one available. case ${toolchain} in - arm*-darwin*) + arm*-darwin-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" iphoneos_sdk_dir="$(show_darwin_sdk_path iphoneos)" if [ -d "${iphoneos_sdk_dir}" ]; then @@ -801,7 +933,7 @@ process_common_toolchain() { add_ldflags "-isysroot ${iphoneos_sdk_dir}" fi ;; - x86*-darwin*) + *-darwin*) osx_sdk_dir="$(show_darwin_sdk_path macosx)" if [ -d "${osx_sdk_dir}" ]; then add_cflags "-isysroot ${osx_sdk_dir}" @@ -843,6 +975,26 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.11" add_ldflags "-mmacosx-version-min=10.11" ;; + *-darwin16-*) + add_cflags "-mmacosx-version-min=10.12" + add_ldflags "-mmacosx-version-min=10.12" + ;; + *-darwin17-*) + add_cflags "-mmacosx-version-min=10.13" + add_ldflags "-mmacosx-version-min=10.13" + ;; + *-darwin18-*) + add_cflags "-mmacosx-version-min=10.14" + add_ldflags "-mmacosx-version-min=10.14" + ;; + *-darwin19-*) + add_cflags "-mmacosx-version-min=10.15" + add_ldflags "-mmacosx-version-min=10.15" + ;; + *-darwin2[0-5]-*) + add_cflags "-arch ${toolchain%%-*}" + add_ldflags "-arch ${toolchain%%-*}" + ;; *-iphonesimulator-*) add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}" add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}" @@ -864,34 +1016,36 @@ process_common_toolchain() { ;; esac - # Process ARM architecture variants + # Process architecture variants case ${toolchain} in arm*) - # on arm, isa versions are supersets - case ${tgt_isa} in - arm64|armv8) - soft_enable neon + case ${toolchain} in + armv7*-darwin*) + # Runtime cpu detection is not defined for these targets. + enabled runtime_cpu_detect && disable_feature runtime_cpu_detect ;; - armv7|armv7s) - soft_enable neon - # Only enable neon_asm when neon is also enabled. - enabled neon && soft_enable neon_asm - # If someone tries to force it through, die. - if disabled neon && enabled neon_asm; then - die "Disabling neon while keeping neon-asm is not supported" - fi + *) + soft_enable runtime_cpu_detect ;; esac - asm_conversion_cmd="cat" + if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then + soft_enable neon + # Only enable neon_asm when neon is also enabled. + enabled neon && soft_enable neon_asm + # If someone tries to force it through, die. + if disabled neon && enabled neon_asm; then + die "Disabling neon while keeping neon-asm is not supported" + fi + fi + asm_conversion_cmd="cat" case ${tgt_cc} in gcc) link_with_cc=gcc setup_gnu_toolchain arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --defsym ARCHITECTURE=${arch_int} tune_cflags="-mtune=" if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then if [ -z "${float_abi}" ]; then @@ -917,7 +1071,17 @@ EOF fi enabled debug && add_asflags -g - asm_conversion_cmd="${source_path}/build/make/ads2gas.pl" + asm_conversion_cmd="${source_path_mk}/build/make/ads2gas.pl" + + case ${tgt_os} in + win*) + asm_conversion_cmd="$asm_conversion_cmd -noelf" + AS="$CC -c" + EXE_SFX=.exe + enable_feature thumb + ;; + esac + if enabled thumb; then asm_conversion_cmd="$asm_conversion_cmd -thumb" check_add_cflags -mthumb @@ -925,18 +1089,44 @@ EOF fi ;; vs*) - asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" - AS_SFX=.S - msvs_arch_dir=arm-msvs - disable_feature multithread - disable_feature unit_tests - vs_version=${tgt_cc##vs} - if [ $vs_version -ge 12 ]; then - # MSVC 2013 doesn't allow doing plain .exe projects for ARM, - # only "AppContainerApplication" which requires an AppxManifest. - # Therefore disable the examples, just build the library. - disable_feature examples - disable_feature tools + # A number of ARM-based Windows platforms are constrained by their + # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs + # and so can be selected as 'win32'. + if [ ${tgt_os} = "win32" ]; then + asm_conversion_cmd="${source_path_mk}/build/make/ads2armasm_ms.pl" + AS_SFX=.S + msvs_arch_dir=arm-msvs + disable_feature multithread + disable_feature unit_tests + if [ ${tgt_cc##vs} -ge 12 ]; then + # MSVC 2013 doesn't allow doing plain .exe projects for ARM32, + # only "AppContainerApplication" which requires an AppxManifest. + # Therefore disable the examples, just build the library. + disable_feature examples + disable_feature tools + fi + else + # Windows 10 on ARM, on the other hand, has full Windows SDK support + # for building Win32 ARM64 applications in addition to ARM64 + # Windows Store apps. It is the only 64-bit ARM ABI that + # Windows supports, so it is the default definition of 'win64'. + # ARM64 build support officially shipped in Visual Studio 15.9.0. + + # Because the ARM64 Windows SDK's arm_neon.h is ARM32-specific + # while LLVM's is not, probe its validity. + if enabled neon; then + if [ -n "${CC}" ]; then + check_header arm_neon.h || check_header arm64_neon.h && \ + enable_feature win_arm64_neon_h_workaround + else + # If a probe is not possible, assume this is the pure Windows + # SDK and so the workaround is necessary when using Visual + # Studio < 2019. + if [ ${tgt_cc##vs} -lt 16 ]; then + enable_feature win_arm64_neon_h_workaround + fi + fi + fi fi ;; rvct) @@ -945,7 +1135,6 @@ EOF AS=armasm LD="${source_path}/build/make/armlink_adapter.sh" STRIP=arm-none-linux-gnueabi-strip - NM=arm-none-linux-gnueabi-nm tune_cflags="--cpu=" tune_asflags="--cpu=" if [ -z "${tune_cpu}" ]; then @@ -964,7 +1153,6 @@ EOF fi arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\"" enabled debug && add_asflags -g add_cflags --gnu add_cflags --enum_is_int @@ -979,109 +1167,77 @@ EOF ;; android*) - if [ -n "${sdk_path}" ]; then - SDK_PATH=${sdk_path} - COMPILER_LOCATION=`find "${SDK_PATH}" \ - -name "arm-linux-androideabi-gcc*" -print -quit` - TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi- - CC=${TOOLCHAIN_PATH}gcc - CXX=${TOOLCHAIN_PATH}g++ - AR=${TOOLCHAIN_PATH}ar - LD=${TOOLCHAIN_PATH}gcc - AS=${TOOLCHAIN_PATH}as - STRIP=${TOOLCHAIN_PATH}strip - NM=${TOOLCHAIN_PATH}nm - - if [ -z "${alt_libc}" ]; then - alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \ - awk '{n = split($0,a,"/"); \ - split(a[n-1],b,"-"); \ - print $0 " " b[2]}' | \ - sort -g -k 2 | \ - awk '{ print $1 }' | tail -1` - fi - - if [ -d "${alt_libc}" ]; then - add_cflags "--sysroot=${alt_libc}" - add_ldflags "--sysroot=${alt_libc}" - fi - - # linker flag that routes around a CPU bug in some - # Cortex-A8 implementations (NDK Dev Guide) - add_ldflags "-Wl,--fix-cortex-a8" - - enable_feature pic - soft_enable realtime_only - if [ ${tgt_isa} = "armv7" ]; then - soft_enable runtime_cpu_detect - fi - if enabled runtime_cpu_detect; then - add_cflags "-I${SDK_PATH}/sources/android/cpufeatures" - fi - else - echo "Assuming standalone build with NDK toolchain." - echo "See build/make/Android.mk for details." - check_add_ldflags -static - soft_enable unit_tests - fi - ;; - - darwin*) - XCRUN_FIND="xcrun --sdk iphoneos --find" - CXX="$(${XCRUN_FIND} clang++)" - CC="$(${XCRUN_FIND} clang)" - AR="$(${XCRUN_FIND} ar)" - AS="$(${XCRUN_FIND} as)" - STRIP="$(${XCRUN_FIND} strip)" - NM="$(${XCRUN_FIND} nm)" - RANLIB="$(${XCRUN_FIND} ranlib)" - AS_SFX=.S - LD="${CXX:-$(${XCRUN_FIND} ld)}" - - # ASFLAGS is written here instead of using check_add_asflags - # because we need to overwrite all of ASFLAGS and purge the - # options that were put in above - ASFLAGS="-arch ${tgt_isa} -g" - - add_cflags -arch ${tgt_isa} - add_ldflags -arch ${tgt_isa} - - alt_libc="$(show_darwin_sdk_path iphoneos)" - if [ -d "${alt_libc}" ]; then - add_cflags -isysroot ${alt_libc} - fi - - if [ "${LD}" = "${CXX}" ]; then - add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}" - else - add_ldflags -ios_version_min "${IOS_VERSION_MIN}" - fi - - for d in lib usr/lib usr/lib/system; do - try_dir="${alt_libc}/${d}" - [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}" - done - - case ${tgt_isa} in - armv7|armv7s|armv8|arm64) - if enabled neon && ! check_xcode_minimum_version; then - soft_disable neon - log_echo " neon disabled: upgrade Xcode (need v6.3+)." - if enabled neon_asm; then - soft_disable neon_asm - log_echo " neon_asm disabled: upgrade Xcode (need v6.3+)." - fi - fi + echo "Assuming standalone build with NDK toolchain." + echo "See build/make/Android.mk for details." + check_add_ldflags -static + soft_enable unit_tests + case "$AS" in + *clang) + # The GNU Assembler was removed in the r24 version of the NDK. + # clang's internal assembler works, but `-c` is necessary to + # avoid linking. + add_asflags -c ;; esac + ;; - asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl" + darwin) + if ! enabled external_build; then + XCRUN_FIND="xcrun --sdk iphoneos --find" + CXX="$(${XCRUN_FIND} clang++)" + CC="$(${XCRUN_FIND} clang)" + AR="$(${XCRUN_FIND} ar)" + AS="$(${XCRUN_FIND} as)" + STRIP="$(${XCRUN_FIND} strip)" + AS_SFX=.S + LD="${CXX:-$(${XCRUN_FIND} ld)}" - if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then - check_add_cflags -fembed-bitcode - check_add_asflags -fembed-bitcode - check_add_ldflags -fembed-bitcode + # ASFLAGS is written here instead of using check_add_asflags + # because we need to overwrite all of ASFLAGS and purge the + # options that were put in above + ASFLAGS="-arch ${tgt_isa} -g" + + add_cflags -arch ${tgt_isa} + add_ldflags -arch ${tgt_isa} + + alt_libc="$(show_darwin_sdk_path iphoneos)" + if [ -d "${alt_libc}" ]; then + add_cflags -isysroot ${alt_libc} + fi + + if [ "${LD}" = "${CXX}" ]; then + add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}" + else + add_ldflags -ios_version_min "${IOS_VERSION_MIN}" + fi + + for d in lib usr/lib usr/lib/system; do + try_dir="${alt_libc}/${d}" + [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}" + done + + case ${tgt_isa} in + armv7|armv7s|armv8|arm64) + if enabled neon && ! check_xcode_minimum_version; then + soft_disable neon + log_echo " neon disabled: upgrade Xcode (need v6.3+)." + if enabled neon_asm; then + soft_disable neon_asm + log_echo " neon_asm disabled: upgrade Xcode (need v6.3+)." + fi + fi + ;; + esac + + if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ] \ + && [ "$(show_xcode_version | cut -d. -f1)" -lt 16 ]; then + check_add_cflags -fembed-bitcode + check_add_asflags -fembed-bitcode + check_add_ldflags -fembed-bitcode + fi fi + + asm_conversion_cmd="${source_path_mk}/build/make/ads2gas_apple.pl" ;; linux*) @@ -1108,6 +1264,38 @@ EOF fi ;; esac + + # AArch64 ISA extensions are treated as supersets. + if [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then + aarch64_arch_flag_neon="arch=armv8-a" + aarch64_arch_flag_neon_dotprod="arch=armv8.2-a+dotprod" + aarch64_arch_flag_neon_i8mm="arch=armv8.2-a+dotprod+i8mm" + aarch64_arch_flag_sve="arch=armv8.2-a+dotprod+i8mm+sve" + aarch64_arch_flag_sve2="arch=armv9-a+sve2" + for ext in ${ARCH_EXT_LIST_AARCH64}; do + if [ "$disable_exts" = "yes" ]; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + else + # Check the compiler supports the -march flag for the extension. + # This needs to happen after toolchain/OS inspection so we handle + # $CROSS etc correctly when checking for flags, else these will + # always fail. + flag="$(eval echo \$"aarch64_arch_flag_${ext}")" + check_gcc_machine_option "${flag}" "${ext}" + if ! enabled $ext; then + # Disable higher order extensions to simplify dependencies. + disable_exts="yes" + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} " + soft_disable $ext + fi + fi + done + if enabled sve; then + check_neon_sve_bridge_compiles + fi + fi + ;; mips*) link_with_cc=gcc @@ -1135,12 +1323,24 @@ EOF check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64 check_add_ldflags -mips64r6 -mabi=64 -mfp64 ;; + loongson3*) + check_cflags -march=loongson3a && soft_enable mmi \ + || disable_feature mmi + check_cflags -mmsa && soft_enable msa \ + || disable_feature msa + tgt_isa=loongson3a + ;; esac + if enabled mmi || enabled msa; then + soft_enable runtime_cpu_detect + fi + if enabled msa; then - add_cflags -mmsa - add_asflags -mmsa - add_ldflags -mmsa + # TODO(libyuv:793) + # The new mips functions in libyuv do not build + # with the toolchains we currently use for testing. + soft_disable libyuv fi fi @@ -1148,8 +1348,25 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; + ppc64le*) + link_with_cc=gcc + setup_gnu_toolchain + # Do not enable vsx by default. + # https://bugs.chromium.org/p/webm/issues/detail?id=1522 + enabled vsx || RTCD_OPTIONS="${RTCD_OPTIONS}--disable-vsx " + if [ -n "${tune_cpu}" ]; then + case ${tune_cpu} in + power?) + tune_cflags="-mcpu=" + ;; + esac + fi + ;; x86*) case ${tgt_os} in + android) + soft_enable realtime_only + ;; win*) enabled gcc && add_cflags -fno-common ;; @@ -1164,6 +1381,10 @@ EOF AS=${AS:-nasm} add_ldflags -Zhigh-mem ;; + darwin*) + enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64" + add_cflags ${darwin_arch} + add_ldflags ${darwin_arch} esac AS="${alt_as:-${AS:-auto}}" @@ -1196,24 +1417,12 @@ EOF enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer ;; vs*) - # When building with Microsoft Visual Studio the assembler is - # invoked directly. Checking at configure time is unnecessary. - # Skip the check by setting AS arbitrarily - AS=msvs msvs_arch_dir=x86-msvs - vc_version=${tgt_cc##vs} - case $vc_version in - 7|8|9|10) - echo "${tgt_cc} does not support avx/avx2, disabling....." - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 " - soft_disable avx - soft_disable avx2 - ;; - esac - case $vc_version in - 7|8|9) - echo "${tgt_cc} omits stdint.h, disabling webm-io..." - soft_disable webm_io + case ${tgt_cc##vs} in + 14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 ;; esac ;; @@ -1246,8 +1455,13 @@ EOF elif disabled $ext; then disable_exts="yes" else - # use the shortened version for the flag: sse4_1 -> sse4 - check_gcc_machine_option ${ext%_*} $ext + if [ "$ext" = "avx512" ]; then + check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + check_gcc_avx512_compiles + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi fi done @@ -1273,7 +1487,6 @@ EOF esac log_echo " using $AS" fi - [ "${AS##*/}" = nasm ] && add_asflags -Ox AS_SFX=.asm case ${tgt_os} in win32) @@ -1282,7 +1495,7 @@ EOF EXE_SFX=.exe ;; win64) - add_asflags -f x64 + add_asflags -f win64 enabled debug && add_asflags -g cv8 EXE_SFX=.exe ;; @@ -1294,9 +1507,6 @@ EOF ;; darwin*) add_asflags -f macho${bits} - enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64" - add_cflags ${darwin_arch} - add_ldflags ${darwin_arch} # -mdynamic-no-pic is still a bit of voodoo -- it was required at # one time, but does not seem to be now, and it breaks some of the # code that still relies on inline assembly. @@ -1309,7 +1519,8 @@ EOF add_cflags ${sim_arch} add_ldflags ${sim_arch} - if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then + if [ "$(disabled external_build)" ] && + [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it # on is pointless (unless building a C-only lib). Warn the user, but # do nothing here. @@ -1326,6 +1537,15 @@ EOF ;; esac ;; + loongarch*) + link_with_cc=gcc + setup_gnu_toolchain + + enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"' + enabled lsx && soft_enable runtime_cpu_detect + enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"' + enabled lasx && soft_enable runtime_cpu_detect + ;; *-gcc|generic-gnu) link_with_cc=gcc enable_feature gcc @@ -1333,6 +1553,14 @@ EOF ;; esac + # Enable PGO + if [ -n "${pgo_file}" ]; then + check_add_cflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by compiler" + check_add_ldflags -fprofile-use=${pgo_file} || \ + die "-fprofile-use is not supported by linker" + fi + # Try to enable CPU specific tuning if [ -n "${tune_cpu}" ]; then if [ -n "${tune_cflags}" ]; then @@ -1353,6 +1581,9 @@ EOF else check_add_cflags -DNDEBUG fi + enabled profile && + check_add_cflags -fprofile-generate && + check_add_ldflags -fprofile-generate enabled gprof && check_add_cflags -pg && check_add_ldflags -pg enabled gcov && @@ -1387,7 +1618,7 @@ EOF # Try to find which inline keywords are supported check_cc < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF ;; esac fi @@ -1416,6 +1651,26 @@ EOF echo "msa optimizations are available only for little endian platforms" disable_feature msa fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi + fi + ;; + esac + + # only for LOONGARCH platforms + case ${toolchain} in + loongarch*) + if enabled big_endian; then + if enabled lsx; then + echo "lsx optimizations are available only for little endian platforms" + disable_feature lsx + fi + if enabled lasx; then + echo "lasx optimizations are available only for little endian platforms" + disable_feature lasx + fi fi ;; esac diff --git a/media/libvpx/libvpx/build/make/gen_asm_deps.sh b/media/libvpx/libvpx/build/make/gen_asm_deps.sh index 6a7bff9ebc..3bd4d125f1 100644 --- a/media/libvpx/libvpx/build/make/gen_asm_deps.sh +++ b/media/libvpx/libvpx/build/make/gen_asm_deps.sh @@ -42,7 +42,7 @@ done [ -n "$srcfile" ] || show_help sfx=${sfx:-asm} -includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | +includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile | perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;") #" restore editor state for inc in ${includes}; do diff --git a/media/libvpx/libvpx/build/make/gen_msvs_sln.sh b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh index 7d5f468109..0b312850fe 100644 --- a/media/libvpx/libvpx/build/make/gen_msvs_sln.sh +++ b/media/libvpx/libvpx/build/make/gen_msvs_sln.sh @@ -25,7 +25,7 @@ files. Options: --help Print this message --out=outfile Redirect output to a file - --ver=version Version (7,8,9,10,11,12,14) of visual studio to generate for + --ver=version Version (14-17) of visual studio to generate for --target=isa-os-cc Target specifier EOF exit 1 @@ -213,13 +213,15 @@ for opt in "$@"; do ;; --dep=*) eval "${optval%%:*}_deps=\"\${${optval%%:*}_deps} ${optval##*:}\"" ;; - --ver=*) vs_ver="$optval" - case $optval in - 10|11|12|14) - ;; - *) die Unrecognized Visual Studio Version in $opt - ;; - esac + --ver=*) + vs_ver="$optval" + case $optval in + 14) vs_year=2015 ;; + 15) vs_year=2017 ;; + 16) vs_year=2019 ;; + 17) vs_year=2022 ;; + *) die Unrecognized Visual Studio Version in $opt ;; + esac ;; --target=*) target="${optval}" ;; @@ -230,18 +232,11 @@ for opt in "$@"; do done outfile=${outfile:-/dev/stdout} mkoutfile=${mkoutfile:-/dev/stdout} -case "${vs_ver:-10}" in - 10) sln_vers="11.00" - sln_vers_str="Visual Studio 2010" - ;; - 11) sln_vers="12.00" - sln_vers_str="Visual Studio 2012" - ;; - 12) sln_vers="12.00" - sln_vers_str="Visual Studio 2013" - ;; - 14) sln_vers="14.00" - sln_vers_str="Visual Studio 2015" +case "${vs_ver}" in + 1[4-7]) + # VS has used Format Version 12.00 continuously since vs11. + sln_vers="12.00" + sln_vers_str="Visual Studio ${vs_year}" ;; esac sfx=vcxproj diff --git a/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh index 2cf62c117c..1e1db05bb2 100644 --- a/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/media/libvpx/libvpx/build/make/gen_msvs_vcxproj.sh @@ -34,7 +34,7 @@ Options: --name=project_name Name of the project (required) --proj-guid=GUID GUID to use for the project --module-def=filename File containing export definitions (for DLLs) - --ver=version Version (10,11,12,14) of visual studio to generate for + --ver=version Version (14-16) of visual studio to generate for --src-path-bare=dir Path to root of source tree -Ipath/to/include Additional include directories -DFLAG[=value] Preprocessor macros to define @@ -82,7 +82,7 @@ generate_filter() { | sed -e "s,$src_path_bare,," \ -e 's/^[\./]\+//g' -e 's,[:/ ],_,g') - if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then + if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $uses_asm; then # Avoid object file name collisions, i.e. vpx_config.c and # vpx_config.asm produce the same object file without # this additional suffix. @@ -141,7 +141,17 @@ for opt in "$@"; do case "$opt" in --help|-h) show_help ;; - --target=*) target="${optval}" + --target=*) + target="${optval}" + platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}') + case "$platform_toolset" in + clangcl) platform_toolset="ClangCl" + ;; + "") + ;; + *) die Unrecognized Visual Studio Platform Toolset in $opt + ;; + esac ;; --out=*) outfile="$optval" ;; @@ -157,6 +167,8 @@ for opt in "$@"; do ;; --lib) proj_kind="lib" ;; + --as=*) as="${optval}" + ;; --src-path-bare=*) src_path_bare=$(fix_path "$optval") src_path_bare=${src_path_bare%/} @@ -168,7 +180,7 @@ for opt in "$@"; do --ver=*) vs_ver="$optval" case "$optval" in - 10|11|12|14) + 1[4-7]) ;; *) die Unrecognized Visual Studio Version in $opt ;; @@ -215,13 +227,7 @@ fix_file_list file_list outfile=${outfile:-/dev/stdout} guid=${guid:-`generate_uuid`} -asm_use_custom_step=false uses_asm=${uses_asm:-false} -case "${vs_ver:-11}" in - 10|11|12|14) - asm_use_custom_step=$uses_asm - ;; -esac [ -n "$name" ] || die "Project name (--name) must be specified!" [ -n "$target" ] || die "Target (--target) must be specified!" @@ -253,13 +259,22 @@ libs=${libs// /;} case "$target" in x86_64*) platforms[0]="x64" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win64 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win64 ${yasmincs} "%(FullPath)"" ;; x86*) platforms[0]="Win32" - asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)"" - asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)"" + asm_Debug_cmdline="${as} -Xvc -gcv8 -f win32 ${yasmincs} "%(FullPath)"" + asm_Release_cmdline="${as} -Xvc -f win32 ${yasmincs} "%(FullPath)"" + ;; + arm64*) + platforms[0]="ARM64" + # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC. + if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then + platforms[1]="ARM64EC" + fi + asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)"" + asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)"" ;; arm*) platforms[0]="ARM" @@ -307,6 +322,16 @@ generate_vcxproj() { tag_content ApplicationType "Windows Store" tag_content ApplicationTypeRevision 8.1 fi + if [ "${platforms[0]}" = "ARM64" ]; then + # Require the first Visual Studio version to have ARM64 support. + tag_content MinimumVisualStudioVersion 15.9 + fi + if [ $vs_ver -eq 15 ] && [ "${platforms[0]}" = "ARM64" ]; then + # Since VS 15 does not have a 'use latest SDK version' facility, + # specifically require the contemporaneous SDK with official ARM64 + # support. + tag_content WindowsTargetPlatformVersion 10.0.17763.0 + fi close_tag PropertyGroup tag Import \ @@ -324,28 +349,21 @@ generate_vcxproj() { else tag_content ConfigurationType StaticLibrary fi - if [ "$vs_ver" = "11" ]; then - if [ "$plat" = "ARM" ]; then - # Setting the wp80 toolchain automatically sets the - # WINAPI_FAMILY define, which is required for building - # code for arm with the windows headers. Alternatively, - # one could add AppContainerApplication=true in the Globals - # section and add PrecompiledHeader=NotUsing and - # CompileAsWinRT=false in ClCompile and SubSystem=Console - # in Link. - tag_content PlatformToolset v110_wp80 - else - tag_content PlatformToolset v110 + if [ -n "$platform_toolset" ]; then + tag_content PlatformToolset "$platform_toolset" + else + if [ "$vs_ver" = "14" ]; then + tag_content PlatformToolset v140 + fi + if [ "$vs_ver" = "15" ]; then + tag_content PlatformToolset v141 + fi + if [ "$vs_ver" = "16" ]; then + tag_content PlatformToolset v142 + fi + if [ "$vs_ver" = "17" ]; then + tag_content PlatformToolset v143 fi - fi - if [ "$vs_ver" = "12" ]; then - # Setting a PlatformToolset indicating windows phone isn't - # enough to build code for arm with MSVC 2013, one strictly - # has to enable AppContainerApplication as well. - tag_content PlatformToolset v120 - fi - if [ "$vs_ver" = "14" ]; then - tag_content PlatformToolset v140 fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then diff --git a/media/libvpx/libvpx/build/make/iosbuild.sh b/media/libvpx/libvpx/build/make/iosbuild.sh index c703f22b0c..d9594b9816 100644 --- a/media/libvpx/libvpx/build/make/iosbuild.sh +++ b/media/libvpx/libvpx/build/make/iosbuild.sh @@ -30,13 +30,9 @@ SCRIPT_DIR=$(dirname "$0") LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) ORIG_PWD="$(pwd)" -ARM_TARGETS="arm64-darwin-gcc - armv7-darwin-gcc - armv7s-darwin-gcc" -SIM_TARGETS="x86-iphonesimulator-gcc - x86_64-iphonesimulator-gcc" -OSX_TARGETS="x86-darwin15-gcc - x86_64-darwin15-gcc" +ARM_TARGETS="arm64-darwin-gcc" +SIM_TARGETS="x86_64-iphonesimulator-gcc" +OSX_TARGETS="x86_64-darwin16-gcc" TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" # Configures for the target specified by $1, and invokes make with the dist @@ -132,7 +128,8 @@ create_vpx_framework_config_shim() { done # Consume the last line of output from the loop: We don't want it. - sed -i '' -e '$d' "${config_file}" + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" printf "#endif\n\n" >> "${config_file}" printf "#endif // ${include_guard}" >> "${config_file}" @@ -244,7 +241,7 @@ build_framework() { # Trap function. Cleans up the subtree used to build all targets contained in # $TARGETS. cleanup() { - local readonly res=$? + local res=$? cd "${ORIG_PWD}" if [ $res -ne 0 ]; then @@ -271,7 +268,7 @@ cat << EOF --help: Display this message and exit. --enable-shared: Build a dynamic framework for use on iOS 8 or later. --extra-configure-args : Extra args to pass when configuring libvpx. - --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86 + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 and x86_64. Allows linking to framework when builds target MacOSX instead of iOS. --preserve-build-output: Do not delete the build directory. @@ -350,7 +347,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi if [ "${VERBOSE}" = "yes" ]; then diff --git a/media/libvpx/libvpx/build/make/msvs_common.sh b/media/libvpx/libvpx/build/make/msvs_common.sh index 88f1cf9b57..3989fec0d5 100644 --- a/media/libvpx/libvpx/build/make/msvs_common.sh +++ b/media/libvpx/libvpx/build/make/msvs_common.sh @@ -9,7 +9,8 @@ ## be found in the AUTHORS file in the root of the source tree. ## -if [ "$(uname -o 2>/dev/null)" = "Cygwin" ] \ +shell_name="$(uname -o 2>/dev/null)" +if [[ "$shell_name" = "Cygwin" || "$shell_name" = "Msys" ]] \ && cygpath --help >/dev/null 2>&1; then FIXPATH='cygpath -m' else @@ -41,6 +42,15 @@ fix_path() { # Corrects the paths in file_list in one pass for efficiency. # $1 is the name of the array to be modified. fix_file_list() { + if [ "${FIXPATH}" = "echo_path" ] ; then + # When used with echo_path, fix_file_list is a no-op. Avoid warning about + # unsupported 'declare -n' when it is not important. + return 0 + elif [ "${BASH_VERSINFO}" -lt 4 ] ; then + echo "Cygwin path conversion has failed. Please use a version of bash" + echo "which supports nameref (-n), introduced in bash 4.3" + return 1 + fi declare -n array_ref=$1 files=$(fix_path "${array_ref[@]}") local IFS=$'\n' diff --git a/media/libvpx/libvpx/build/make/rtcd.pl b/media/libvpx/libvpx/build/make/rtcd.pl index 9e746c46d0..156199abd7 100644 --- a/media/libvpx/libvpx/build/make/rtcd.pl +++ b/media/libvpx/libvpx/build/make/rtcd.pl @@ -1,4 +1,13 @@ #!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## no strict 'refs'; use warnings; @@ -64,6 +73,10 @@ sub vpx_config($) { } sub specialize { + if (@_ <= 1) { + die "'specialize' must be called with a function name and at least one ", + "architecture ('C' is implied): \n@_\n"; + } my $fn=$_[0]; shift; foreach my $opt (@_) { @@ -199,7 +212,20 @@ sub filter { # sub common_top() { my $include_guard = uc($opts{sym})."_H_"; + my @time = localtime; + my $year = $time[5] + 1900; print < $PRIORITY_INDEX{$b} } keys %required); if ($opts{arch} eq 'x86') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); x86; } elsif ($opts{arch} eq 'x86_64') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); - @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); - &require(@REQUIRES); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); + if (keys %required == 0) { + @REQUIRES = filter(qw/mmx sse sse2/); + &require(@REQUIRES); + } x86; } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') { + my $have_dspr2 = 0; + my $have_msa = 0; + my $have_mmi = 0; @ALL_ARCHS = filter("$opts{arch}"); open CONFIG_FILE, $opts{config} or die "Error opening config file '$opts{config}': $!\n"; while () { if (/HAVE_DSPR2=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); - last; + $have_dspr2 = 1; } if (/HAVE_MSA=yes/) { - @ALL_ARCHS = filter("$opts{arch}", qw/msa/); - last; + $have_msa = 1; + } + if (/HAVE_MMI=yes/) { + $have_mmi = 1; } } close CONFIG_FILE; + if ($have_dspr2 == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/); + } elsif ($have_msa == 1 && $have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi msa/); + } elsif ($have_msa == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/msa/); + } elsif ($have_mmi == 1) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); + } else { + unoptimized; + } mips; } elsif ($opts{arch} =~ /armv7\w?/) { @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon/); + @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve sve2/); + if (keys %required == 0) { + @REQUIRES = filter(qw/neon/); + &require(@REQUIRES); + } arm; +} elsif ($opts{arch} =~ /^ppc/ ) { + @ALL_ARCHS = filter(qw/vsx/); + ppc; +} elsif ($opts{arch} =~ /loongarch/ ) { + @ALL_ARCHS = filter(qw/lsx lasx/); + loongarch; } else { unoptimized; } diff --git a/media/libvpx/libvpx/build/make/thumb.pm b/media/libvpx/libvpx/build/make/thumb.pm index 483c2539c6..ef4b316771 100644 --- a/media/libvpx/libvpx/build/make/thumb.pm +++ b/media/libvpx/libvpx/build/make/thumb.pm @@ -11,11 +11,8 @@ package thumb; -sub FixThumbInstructions($$) +sub FixThumbInstructions($) { - my $short_branches = $_[1]; - my $branch_shift_offset = $short_branches ? 1 : 0; - # Write additions with shifts, such as "add r10, r11, lsl #8", # in three operand form, "add r10, r10, r11, lsl #8". s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g; @@ -54,13 +51,6 @@ sub FixThumbInstructions($$) # "addne r0, r0, r2". s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g; - # Convert a conditional addition to the pc register into a series of - # instructions. This converts "addlt pc, pc, r3, lsl #2" into - # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12", - # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12". - # This assumes that r12 is free at this point. - s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g; - # Convert "mov pc, lr" into "bx lr", since the former only works # for switching from arm to thumb (and only in armv7), but not # from thumb to arm. diff --git a/media/libvpx/libvpx/build/make/version.sh b/media/libvpx/libvpx/build/make/version.sh index 6967527771..8f717cc96f 100644 --- a/media/libvpx/libvpx/build/make/version.sh +++ b/media/libvpx/libvpx/build/make/version.sh @@ -60,6 +60,9 @@ if [ ${bare} ]; then echo "${changelog_version}${git_version_id}" > $$.tmp else cat<$$.tmp +// This file is generated. Do not edit. +#ifndef VPX_VERSION_H_ +#define VPX_VERSION_H_ #define VERSION_MAJOR $major_version #define VERSION_MINOR $minor_version #define VERSION_PATCH $patch_version @@ -67,6 +70,7 @@ else #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) #define ${id}_NOSP "${version_str}" #define ${id} " ${version_str}" +#endif // VPX_VERSION_H_ EOF fi if [ -n "$out_file" ]; then diff --git a/media/libvpx/libvpx/codereview.settings b/media/libvpx/libvpx/codereview.settings index 34c6f1d9de..ccba2eeed2 100644 --- a/media/libvpx/libvpx/codereview.settings +++ b/media/libvpx/libvpx/codereview.settings @@ -1,5 +1,4 @@ -# This file is used by gcl to get repository specific information. -GERRIT_HOST: chromium-review.googlesource.com -GERRIT_PORT: 29418 +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True CODE_REVIEW_SERVER: chromium-review.googlesource.com GERRIT_SQUASH_UPLOADS: False diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure index fb732acf3e..8eee4e4425 100644 --- a/media/libvpx/libvpx/configure +++ b/media/libvpx/libvpx/configure @@ -31,7 +31,6 @@ Advanced options: --libc=PATH path to alternate libc --size-limit=WxH max size to allow in the decoder --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] - --sdk-path=PATH path to root of sdk (android builds only) ${toggle_codec_srcs} in/exclude codec library source code ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_static_msvcrt} use static MSVCRT (VS builds only) @@ -100,19 +99,36 @@ EOF # alphabetically by architecture, generic-gnu last. all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" +all_platforms="${all_platforms} arm64-darwin20-gcc" +all_platforms="${all_platforms} arm64-darwin21-gcc" +all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" +all_platforms="${all_platforms} arm64-darwin24-gcc" +all_platforms="${all_platforms} arm64-darwin25-gcc" all_platforms="${all_platforms} arm64-linux-gcc" +all_platforms="${all_platforms} arm64-win64-gcc" +all_platforms="${all_platforms} arm64-win64-vs15" +all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" +all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 -all_platforms="${all_platforms} armv7-win32-vs11" -all_platforms="${all_platforms} armv7-win32-vs12" +all_platforms="${all_platforms} armv7-win32-gcc" all_platforms="${all_platforms} armv7-win32-vs14" +all_platforms="${all_platforms} armv7-win32-vs15" +all_platforms="${all_platforms} armv7-win32-vs16" +all_platforms="${all_platforms} armv7-win32-vs17" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" +all_platforms="${all_platforms} loongarch32-linux-gcc" +all_platforms="${all_platforms} loongarch64-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" +all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" all_platforms="${all_platforms} x86-darwin8-gcc" @@ -125,16 +141,18 @@ all_platforms="${all_platforms} x86-darwin12-gcc" all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-darwin14-gcc" all_platforms="${all_platforms} x86-darwin15-gcc" +all_platforms="${all_platforms} x86-darwin16-gcc" +all_platforms="${all_platforms} x86-darwin17-gcc" all_platforms="${all_platforms} x86-iphonesimulator-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-os2-gcc" all_platforms="${all_platforms} x86-solaris-gcc" all_platforms="${all_platforms} x86-win32-gcc" -all_platforms="${all_platforms} x86-win32-vs10" -all_platforms="${all_platforms} x86-win32-vs11" -all_platforms="${all_platforms} x86-win32-vs12" all_platforms="${all_platforms} x86-win32-vs14" +all_platforms="${all_platforms} x86-win32-vs15" +all_platforms="${all_platforms} x86-win32-vs16" +all_platforms="${all_platforms} x86-win32-vs17" all_platforms="${all_platforms} x86_64-android-gcc" all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" @@ -143,15 +161,25 @@ all_platforms="${all_platforms} x86_64-darwin12-gcc" all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-darwin14-gcc" all_platforms="${all_platforms} x86_64-darwin15-gcc" +all_platforms="${all_platforms} x86_64-darwin16-gcc" +all_platforms="${all_platforms} x86_64-darwin17-gcc" +all_platforms="${all_platforms} x86_64-darwin18-gcc" +all_platforms="${all_platforms} x86_64-darwin19-gcc" +all_platforms="${all_platforms} x86_64-darwin20-gcc" +all_platforms="${all_platforms} x86_64-darwin21-gcc" +all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" +all_platforms="${all_platforms} x86_64-darwin24-gcc" +all_platforms="${all_platforms} x86_64-darwin25-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" all_platforms="${all_platforms} x86_64-win64-gcc" -all_platforms="${all_platforms} x86_64-win64-vs10" -all_platforms="${all_platforms} x86_64-win64-vs11" -all_platforms="${all_platforms} x86_64-win64-vs12" all_platforms="${all_platforms} x86_64-win64-vs14" +all_platforms="${all_platforms} x86_64-win64-vs15" +all_platforms="${all_platforms} x86_64-win64-vs16" +all_platforms="${all_platforms} x86_64-win64-vs17" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured @@ -163,11 +191,14 @@ for t in ${all_targets}; do [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} done +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + if ! perl --version >/dev/null; then die "Perl is required to build" fi - if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then # test to see if source_path already configured if [ -f "${source_path}/vpx_config.h" ]; then @@ -220,10 +251,22 @@ CODEC_FAMILIES=" ARCH_LIST=" arm + aarch64 mips x86 x86_64 + ppc + loongarch " + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm + sve + sve2 +" + ARCH_EXT_LIST_X86=" mmx sse @@ -233,10 +276,18 @@ ARCH_EXT_LIST_X86=" sse4_1 avx avx2 + avx512 " + +ARCH_EXT_LIST_LOONGSON=" + mmi + lsx + lasx +" + ARCH_EXT_LIST=" - neon neon_asm + ${ARCH_EXT_LIST_AARCH64} mips32 dspr2 @@ -244,6 +295,10 @@ ARCH_EXT_LIST=" mips64 ${ARCH_EXT_LIST_X86} + + vsx + + ${ARCH_EXT_LIST_LOONGSON} " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -252,10 +307,10 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - spatial_svc fp_mb_stats emulate_hardware - misc_fixes + non_greedy_mv + collect_component_timing " CONFIG_LIST=" dependency_tracking @@ -310,6 +365,9 @@ CONFIG_LIST=" better_hw_compatibility experimental size_limit + always_adjust_bpm + bitstream_debug + mismatch_debug ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -322,6 +380,7 @@ CMDLINE_SELECT=" install_libs install_srcs debug + profile gprof gcov pic @@ -369,6 +428,9 @@ CMDLINE_SELECT=" better_hw_compatibility vp9_highbitdepth experimental + always_adjust_bpm + bitstream_debug + mismatch_debug " process_cmdline() { @@ -399,6 +461,12 @@ process_cmdline() { } post_process_cmdline() { + if enabled coefficient_range_checking; then + echo "coefficient-range-checking is for decoders only, disabling encoders:" + soft_disable vp8_encoder + soft_disable vp9_encoder + fi + c="" # Enable all detected codecs, if they haven't been disabled @@ -420,6 +488,7 @@ process_targets() { enabled child || write_common_config_banner write_common_target_config_h ${BUILD_PFX}vpx_config.h write_common_config_targets + enabled win_arm64_neon_h_workaround && write_win_arm64_neon_h_workaround ${BUILD_PFX}arm_neon.h # Calculate the default distribution name, based on the enabled features cf="" @@ -496,7 +565,7 @@ process_detect() { # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. case "${tgt_os}" in - linux|os2|darwin*|iphonesimulator*) + linux|os2|solaris|darwin*|iphonesimulator*) # Supported platforms ;; *) @@ -548,16 +617,30 @@ process_detect() { check_ld() { true } + check_lib() { + true + } fi check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}" check_ld < +#include +int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } +EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -567,32 +650,66 @@ process_toolchain() { if enabled gcc; then enabled werror && check_add_cflags -Werror check_add_cflags -Wall - check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization + check_add_cflags -Wextra-semi + check_add_cflags -Wextra-semi-stmt check_add_cflags -Wfloat-conversion + check_add_cflags -Wformat=2 + check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual check_add_cflags -Wvla check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wmissing-declarations + check_add_cflags -Wmissing-prototypes + check_add_cflags -Wshadow + check_add_cflags -Wstrict-prototypes check_add_cflags -Wuninitialized + check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused - # -Wextra has some tricky cases. Rather than fix them all now, get the - # flag for as many files as possible and fix the remaining issues - # piecemeal. - # https://bugs.chromium.org/p/webm/issues/detail?id=1069 check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with - # -Wundef so add it explicitly to CFLAGS only. + # these flags so add them explicitly to CFLAGS only. check_cflags -Wundef && add_cflags_only -Wundef + check_cflags -Wframe-larger-than=52000 && \ + add_cflags_only -Wframe-larger-than=52000 if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi - if ! enabled vp9_highbitdepth; then - # Avoid this warning for third_party C++ sources. Some reorganization - # would be needed to apply this only to test/*.cc. - check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99 + # Avoid this warning for third_party C++ sources. Some reorganization + # would be needed to apply this only to test/*.cc. + check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + + # Do not allow implicit vector type conversions on Clang builds (this + # is already the default on GCC builds). + check_add_cflags -flax-vector-conversions=none + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if enabled arm; then + check_add_cxxflags -Wno-psabi fi + + # Enforce C++17 compatibility. + check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wc++23-extensions + check_add_cxxflags -Wnon-virtual-dtor + + # disable some warnings specific to libyuv / libwebm. + check_cxxflags -Wno-missing-declarations \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" + check_cxxflags -Wno-missing-prototypes \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-pass-failed \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" + check_cxxflags -Wno-shadow \ + && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow" + check_cxxflags -Wno-unused-parameter \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi if enabled icc; then @@ -644,7 +761,7 @@ process_toolchain() { gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" all_targets="${all_targets} solution" - INLINE="__forceinline" + INLINE="__inline" ;; esac @@ -663,39 +780,33 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - soft_enable webm_io + check_add_cxxflags -std=gnu++17 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; *-darwin-*) + check_add_cxxflags -std=gnu++17 # iOS/ARM builds do not work with gtest. This does not match # x86 targets. ;; *-iphonesimulator-*) - soft_enable webm_io + check_add_cxxflags -std=gnu++17 && soft_enable webm_io soft_enable libyuv ;; *-win*) # Some mingw toolchains don't have pthread available by default. # Treat these more like visual studio where threading in gtest # would be disabled for the same reason. - check_cxx "$@" < $@ +CLEAN-OBJS += vpxdec_srcs.txt + # # Documentation Rules # @@ -403,3 +409,4 @@ CLEAN-OBJS += examples.doxy samples.dox $(ALL_EXAMPLES:.c=.dox) DOCS-yes += examples.doxy samples.dox examples.doxy: samples.dox $(ALL_EXAMPLES:.c=.dox) @echo "INPUT += $^" > $@ + @echo "ENABLED_SECTIONS += samples" >> $@ diff --git a/media/libvpx/libvpx/examples/decode_with_drops.c b/media/libvpx/libvpx/examples/decode_with_drops.c index e69e2a9f9b..03c79a4561 100644 --- a/media/libvpx/libvpx/examples/decode_with_drops.c +++ b/media/libvpx/libvpx/examples/decode_with_drops.c @@ -106,7 +106,7 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) - die_codec(&codec, "Failed to initialize decoder."); + die("Failed to initialize decoder."); while (vpx_video_reader_read_frame(reader)) { vpx_codec_iter_t iter = NULL; diff --git a/media/libvpx/libvpx/examples/postproc.c b/media/libvpx/libvpx/examples/postproc.c index 15713b946a..b53c15ea15 100644 --- a/media/libvpx/libvpx/examples/postproc.c +++ b/media/libvpx/libvpx/examples/postproc.c @@ -86,9 +86,9 @@ int main(int argc, char **argv) { res = vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, VPX_CODEC_USE_POSTPROC); if (res == VPX_CODEC_INCAPABLE) - die_codec(&codec, "Postproc not supported by this decoder."); + die("Postproc not supported by this decoder."); - if (res) die_codec(&codec, "Failed to initialize decoder."); + if (res) die("Failed to initialize decoder."); while (vpx_video_reader_read_frame(reader)) { vpx_codec_iter_t iter = NULL; @@ -109,7 +109,7 @@ int main(int argc, char **argv) { 0 }; if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) die_codec(&codec, "Failed to turn on postproc."); - }; + } // Decode the frame with 15ms deadline if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000)) diff --git a/media/libvpx/libvpx/examples/resize_util.c b/media/libvpx/libvpx/examples/resize_util.c deleted file mode 100644 index 7e529b2e20..0000000000 --- a/media/libvpx/libvpx/examples/resize_util.c +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include -#include -#include - -#include "../tools_common.h" -#include "../vp9/encoder/vp9_resize.h" - -static const char *exec_name = NULL; - -static void usage() { - printf("Usage:\n"); - printf("%s x x ", - exec_name); - printf(" []\n"); -} - -void usage_exit(void) { - usage(); - exit(EXIT_FAILURE); -} - -static int parse_dim(char *v, int *width, int *height) { - char *x = strchr(v, 'x'); - if (x == NULL) x = strchr(v, 'X'); - if (x == NULL) return 0; - *width = atoi(v); - *height = atoi(&x[1]); - if (*width <= 0 || *height <= 0) - return 0; - else - return 1; -} - -int main(int argc, char *argv[]) { - char *fin, *fout; - FILE *fpin, *fpout; - uint8_t *inbuf, *outbuf; - uint8_t *inbuf_u, *outbuf_u; - uint8_t *inbuf_v, *outbuf_v; - int f, frames; - int width, height, target_width, target_height; - - exec_name = argv[0]; - - if (argc < 5) { - printf("Incorrect parameters:\n"); - usage(); - return 1; - } - - fin = argv[1]; - fout = argv[4]; - if (!parse_dim(argv[2], &width, &height)) { - printf("Incorrect parameters: %s\n", argv[2]); - usage(); - return 1; - } - if (!parse_dim(argv[3], &target_width, &target_height)) { - printf("Incorrect parameters: %s\n", argv[3]); - usage(); - return 1; - } - - fpin = fopen(fin, "rb"); - if (fpin == NULL) { - printf("Can't open file %s to read\n", fin); - usage(); - return 1; - } - fpout = fopen(fout, "wb"); - if (fpout == NULL) { - printf("Can't open file %s to write\n", fout); - usage(); - return 1; - } - if (argc >= 6) - frames = atoi(argv[5]); - else - frames = INT_MAX; - - printf("Input size: %dx%d\n", width, height); - printf("Target size: %dx%d, Frames: ", target_width, target_height); - if (frames == INT_MAX) - printf("All\n"); - else - printf("%d\n", frames); - - inbuf = (uint8_t *)malloc(width * height * 3 / 2); - outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); - inbuf_u = inbuf + width * height; - inbuf_v = inbuf_u + width * height / 4; - outbuf_u = outbuf + target_width * target_height; - outbuf_v = outbuf_u + target_width * target_height / 4; - f = 0; - while (f < frames) { - if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break; - vp9_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height, - width, outbuf, target_width, outbuf_u, outbuf_v, - target_width / 2, target_height, target_width); - fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); - f++; - } - printf("%d frames processed\n", f); - fclose(fpin); - fclose(fpout); - - free(inbuf); - free(outbuf); - return 0; -} diff --git a/media/libvpx/libvpx/examples/set_maps.c b/media/libvpx/libvpx/examples/set_maps.c index c0c7d10e72..867e473aea 100644 --- a/media/libvpx/libvpx/examples/set_maps.c +++ b/media/libvpx/libvpx/examples/set_maps.c @@ -209,7 +209,7 @@ int main(int argc, char **argv) { die("Failed to open %s for reading.", argv[4]); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Encode frames. while (vpx_img_read(&raw, infile)) { diff --git a/media/libvpx/libvpx/examples/simple_decoder.c b/media/libvpx/libvpx/examples/simple_decoder.c index 2bb1a05245..d089e826d5 100644 --- a/media/libvpx/libvpx/examples/simple_decoder.c +++ b/media/libvpx/libvpx/examples/simple_decoder.c @@ -118,7 +118,7 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(decoder->codec_interface())); if (vpx_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) - die_codec(&codec, "Failed to initialize decoder."); + die("Failed to initialize decoder."); while (vpx_video_reader_read_frame(reader)) { vpx_codec_iter_t iter = NULL; diff --git a/media/libvpx/libvpx/examples/simple_encoder.c b/media/libvpx/libvpx/examples/simple_encoder.c index dde6344f8d..dffdd6d7da 100644 --- a/media/libvpx/libvpx/examples/simple_encoder.c +++ b/media/libvpx/libvpx/examples/simple_encoder.c @@ -218,7 +218,7 @@ int main(int argc, char **argv) { die("Failed to open %s for reading.", infile_arg); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Encode frames. while (vpx_img_read(&raw, infile)) { diff --git a/media/libvpx/libvpx/vpx/svc_context.h b/media/libvpx/libvpx/examples/svc_context.h similarity index 83% rename from media/libvpx/libvpx/vpx/svc_context.h rename to media/libvpx/libvpx/examples/svc_context.h index 462785075c..7ca987dc83 100644 --- a/media/libvpx/libvpx/vpx/svc_context.h +++ b/media/libvpx/libvpx/examples/svc_context.h @@ -13,11 +13,11 @@ * spatial SVC frame */ -#ifndef VPX_SVC_CONTEXT_H_ -#define VPX_SVC_CONTEXT_H_ +#ifndef VPX_EXAMPLES_SVC_CONTEXT_H_ +#define VPX_EXAMPLES_SVC_CONTEXT_H_ -#include "./vp8cx.h" -#include "./vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" #ifdef __cplusplus extern "C" { @@ -35,12 +35,11 @@ typedef struct { int temporal_layers; // number of temporal layers int temporal_layering_mode; SVC_LOG_LEVEL log_level; // amount of information to display - int log_print; // when set, printf log messages instead of returning the - // message with svc_get_message - int output_rc_stat; // for outputting rc stats - int speed; // speed setting for codec + int output_rc_stat; // for outputting rc stats + int speed; // speed setting for codec int threads; int aqmode; // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on. + int use_psnr; // private storage for vpx_svc_encode void *internal; } SvcContext; @@ -60,6 +59,7 @@ typedef struct SvcInternal { double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; uint32_t bytes_sum[VPX_SS_MAX_LAYERS]; + int number_of_frames[VPX_SS_MAX_LAYERS]; // codec encoding values int width; // width of highest layer @@ -67,11 +67,9 @@ typedef struct SvcInternal { int kf_dist; // distance between keyframes // state variables - int psnr_pkt_received; int layer; int use_multiple_frame_contexts; - char message_buffer[2048]; vpx_codec_ctx_t *codec_ctx; } SvcInternal_t; @@ -106,15 +104,10 @@ void vpx_svc_release(SvcContext *svc_ctx); /** * dump accumulated statistics and reset accumulated values */ -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx); - -/** - * get status message from previous encode - */ -const char *vpx_svc_get_message(const SvcContext *svc_ctx); +void vpx_svc_dump_statistics(SvcContext *svc_ctx); #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_SVC_CONTEXT_H_ +#endif // VPX_EXAMPLES_SVC_CONTEXT_H_ diff --git a/media/libvpx/libvpx/vpx/src/svc_encodeframe.c b/media/libvpx/libvpx/examples/svc_encodeframe.c similarity index 80% rename from media/libvpx/libvpx/vpx/src/svc_encodeframe.c rename to media/libvpx/libvpx/examples/svc_encodeframe.c index c2f80d8851..54843f0f0f 100644 --- a/media/libvpx/libvpx/vpx/src/svc_encodeframe.c +++ b/media/libvpx/libvpx/examples/svc_encodeframe.c @@ -21,8 +21,9 @@ #include #include #define VPX_DISABLE_CTRL_TYPECHECKS 1 +#include "../tools_common.h" #include "./vpx_config.h" -#include "vpx/svc_context.h" +#include "./svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" @@ -95,17 +96,12 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { return (const SvcInternal_t *)svc_ctx->internal; } -static void svc_log_reset(SvcContext *svc_ctx) { - SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal; - si->message_buffer[0] = '\0'; -} - -static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, - ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx, + SVC_LOG_LEVEL level, + const char *fmt, ...) { char buf[512]; int retval = 0; va_list ap; - SvcInternal_t *const si = get_svc_internal(svc_ctx); if (level > svc_ctx->log_level) { return retval; @@ -115,25 +111,17 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, retval = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - if (svc_ctx->log_print) { - printf("%s", buf); - } else { - strncat(si->message_buffer, buf, - sizeof(si->message_buffer) - strlen(si->message_buffer) - 1); - } + printf("%s", buf); - if (level == SVC_LOG_ERROR) { - si->codec_ctx->err_detail = si->message_buffer; - } return retval; } static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { - *value0 = strtol(input, &input, 10); + *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; - *value1 = strtol(input, &input, 10); + *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || @@ -169,6 +157,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, return VPX_CODEC_INVALID_PARAM; input_string = strdup(input); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; token = strtok_r(input_string, delim, &save_ptr); for (i = 0; i < num_layers; ++i) { if (token != NULL) { @@ -208,6 +197,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (options == NULL) return VPX_CODEC_OK; input_string = strdup(options); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; // parse option name option_name = strtok_r(input_string, "=", &input_ptr); @@ -276,7 +266,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could" - "enabled auto alt reference frame, but % layers are enabled\n", + "enabled auto alt reference frame, but %d layers are enabled\n", REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled); res = VPX_CODEC_INVALID_PARAM; } @@ -289,13 +279,13 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { if (svc_ctx == NULL || options == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - strncpy(si->options, options, sizeof(si->options)); + strncpy(si->options, options, sizeof(si->options) - 1); si->options[sizeof(si->options) - 1] = '\0'; return VPX_CODEC_OK; } -vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +static vpx_codec_err_t assign_layer_bitrates( + const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; @@ -391,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *enc_cfg) { vpx_codec_err_t res; - int i, sl, tl; + int sl, tl; SvcInternal_t *const si = get_svc_internal(svc_ctx); if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL || enc_cfg == NULL) { @@ -436,10 +426,14 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; } + if (svc_ctx->spatial_layers == 1) { + si->svc_params.scaling_factor_num[0] = 1; + si->svc_params.scaling_factor_den[0] = 1; + } } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - i = sl * svc_ctx->temporal_layers + tl; + const int i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; if (enc_cfg->rc_end_usage == VPX_CBR && @@ -464,11 +458,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) { - svc_log(svc_ctx, SVC_LOG_ERROR, - "spatial layers * temporal layers exceeds the maximum number of " - "allowed layers of %d\n", - svc_ctx->spatial_layers * svc_ctx->temporal_layers, - (int)VPX_MAX_LAYERS); + svc_log( + svc_ctx, SVC_LOG_ERROR, + "spatial layers * temporal layers (%d) exceeds the maximum number of " + "allowed layers of %d\n", + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } res = assign_layer_bitrates(svc_ctx, enc_cfg); @@ -481,11 +475,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } -#if CONFIG_SPATIAL_SVC - for (i = 0; i < svc_ctx->spatial_layers; ++i) - enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; -#endif - if (svc_ctx->temporal_layers > 1) { int i; for (i = 0; i < svc_ctx->temporal_layers; ++i) { @@ -510,14 +499,28 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->rc_buf_initial_sz = 500; enc_cfg->rc_buf_optimal_sz = 600; enc_cfg->rc_buf_sz = 1000; - enc_cfg->rc_dropframe_thresh = 0; + } + + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + const int i = sl * svc_ctx->temporal_layers + tl; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer; + si->svc_params.min_quantizers[i] = enc_cfg->rc_min_quantizer; + } + } } if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) enc_cfg->g_error_resilient = 1; // Initialize codec - res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR); + vpx_codec_flags_t flags = 0; + if (svc_ctx->use_psnr) { + flags |= VPX_CODEC_USE_PSNR; + } + res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, flags); if (res != VPX_CODEC_OK) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n"); return res; @@ -537,100 +540,27 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, struct vpx_image *rawimg, vpx_codec_pts_t pts, int64_t duration, int deadline) { vpx_codec_err_t res; - vpx_codec_iter_t iter; - const vpx_codec_cx_pkt_t *cx_pkt; SvcInternal_t *const si = get_svc_internal(svc_ctx); if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - svc_log_reset(svc_ctx); - res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, deadline); if (res != VPX_CODEC_OK) { return res; } - // save compressed data - iter = NULL; - while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { - switch (cx_pkt->kind) { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC - case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) { - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].psnr[0], - cx_pkt->data.layer_psnr[i].psnr[1], - cx_pkt->data.layer_psnr[i].psnr[2], - cx_pkt->data.layer_psnr[i].psnr[3]); - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].sse[0], - cx_pkt->data.layer_psnr[i].sse[1], - cx_pkt->data.layer_psnr[i].sse[2], - cx_pkt->data.layer_psnr[i].sse[3]); - - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[i][j] += cx_pkt->data.layer_psnr[i].psnr[j]; - si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j]; - } - } - ++si->psnr_pkt_received; - break; - } - case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) - si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; - break; - } -#endif -#endif - case VPX_CODEC_PSNR_PKT: { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, 0, cx_pkt->data.layer_psnr[0].psnr[0], - cx_pkt->data.layer_psnr[0].psnr[1], - cx_pkt->data.layer_psnr[0].psnr[2], - cx_pkt->data.layer_psnr[0].psnr[3]); - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[0][j] += cx_pkt->data.layer_psnr[0].psnr[j]; - si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j]; - } -#endif - } - ++si->psnr_pkt_received; - break; - default: { break; } - } - } return VPX_CODEC_OK; } -const char *vpx_svc_get_message(const SvcContext *svc_ctx) { - const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - return si->message_buffer; -} - static double calc_psnr(double d) { if (d == 0) return 100; return -10.0 * log(d) / log(10.0); } // dump accumulated statistics and reset accumulated values -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { - int number_of_frames; +void vpx_svc_dump_statistics(SvcContext *svc_ctx) { int i, j; uint32_t bytes_total = 0; double scale[COMPONENTS]; @@ -639,23 +569,20 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { double y_scale; SvcInternal_t *const si = get_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - - svc_log_reset(svc_ctx); - - number_of_frames = si->psnr_pkt_received; - if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx); + if (svc_ctx == NULL || si == NULL) return; svc_log(svc_ctx, SVC_LOG_INFO, "\n"); for (i = 0; i < svc_ctx->spatial_layers; ++i) { svc_log(svc_ctx, SVC_LOG_INFO, - "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n", - i, (double)si->psnr_sum[i][0] / number_of_frames, - (double)si->psnr_sum[i][1] / number_of_frames, - (double)si->psnr_sum[i][2] / number_of_frames, - (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); + "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u], " + "Number_of_frames %d \n", + i, si->psnr_sum[i][0] / si->number_of_frames[i], + si->psnr_sum[i][1] / si->number_of_frames[i], + si->psnr_sum[i][2] / si->number_of_frames[i], + si->psnr_sum[i][3] / si->number_of_frames[i], si->bytes_sum[i], + si->number_of_frames[i]); // the following psnr calculation is deduced from ffmpeg.c#print_report - y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames; + y_scale = si->width * si->height * 255.0 * 255.0 * si->number_of_frames[i]; scale[1] = y_scale; scale[2] = scale[3] = y_scale / 4; // U or V scale[0] = y_scale * 1.5; // total @@ -672,19 +599,18 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { mse[1], mse[2], mse[3]); bytes_total += si->bytes_sum[i]; - // Clear sums for next time. + } + // Clear sums for next time. + for (i = 0; i < svc_ctx->spatial_layers; ++i) { si->bytes_sum[i] = 0; + si->number_of_frames[i] = 0; for (j = 0; j < COMPONENTS; ++j) { si->psnr_sum[i][j] = 0; si->sse_sum[i][j] = 0; } } - // only display statistics once - si->psnr_pkt_received = 0; - svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total); - return vpx_svc_get_message(svc_ctx); } void vpx_svc_release(SvcContext *svc_ctx) { diff --git a/media/libvpx/libvpx/examples/twopass_encoder.c b/media/libvpx/libvpx/examples/twopass_encoder.c index 4e63a7a6c9..07a10d9cf3 100644 --- a/media/libvpx/libvpx/examples/twopass_encoder.c +++ b/media/libvpx/libvpx/examples/twopass_encoder.c @@ -84,6 +84,7 @@ static int get_frame_stats(vpx_codec_ctx_t *ctx, const vpx_image_t *img, const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf; const size_t pkt_size = pkt->data.twopass_stats.sz; stats->buf = realloc(stats->buf, stats->sz + pkt_size); + if (!stats->buf) die("Failed to reallocate stats buffer."); memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size); stats->sz += pkt_size; } @@ -128,7 +129,7 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw, FILE *infile, vpx_fixed_buf_t stats = { NULL, 0 }; if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Calculate frame statistics. while (vpx_img_read(raw, infile)) { @@ -164,7 +165,7 @@ static void pass1(vpx_image_t *raw, FILE *infile, const char *outfile_name, if (!writer) die("Failed to open %s for writing", outfile_name); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Encode frames. while (vpx_img_read(raw, infile)) { @@ -221,7 +222,7 @@ int main(int argc, char **argv) { die("Invalid frame size: %dx%d", w, h); if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1)) - die("Failed to allocate image", w, h); + die("Failed to allocate image (%dx%d)", w, h); printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface())); diff --git a/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c index 0b9663c777..60161da90a 100644 --- a/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/media/libvpx/libvpx/examples/vp8_multi_resolution_encoder.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -61,7 +60,7 @@ void usage_exit(void) { exit(EXIT_FAILURE); } int (*read_frame_p)(FILE *f, vpx_image_t *img); -static int read_frame(FILE *f, vpx_image_t *img) { +static int mulres_read_frame(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; @@ -75,7 +74,7 @@ static int read_frame(FILE *f, vpx_image_t *img) { return res; } -static int read_frame_by_row(FILE *f, vpx_image_t *img) { +static int mulres_read_frame_by_row(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; int plane; @@ -151,7 +150,7 @@ static void write_ivf_frame_header(FILE *outfile, if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header, (int)pkt->data.frame.sz); mem_put_le32(header + 4, pts & 0xFFFFFFFF); mem_put_le32(header + 8, pts >> 32); @@ -190,7 +189,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[0] = 0; cfg->ts_layer_id[1] = 1; // Use 60/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.6f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); cfg->ts_target_bitrate[1] = bitrate; /* 0=L, 1=GF */ @@ -241,8 +240,8 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[2] = 1; cfg->ts_layer_id[3] = 2; // Use 45/20/35 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.45f * bitrate; - cfg->ts_target_bitrate[1] = 0.65f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); cfg->ts_target_bitrate[2] = bitrate; /* 0=L, 1=GF, 2=ARF */ @@ -294,8 +293,8 @@ int main(int argc, char **argv) { vpx_codec_err_t res[NUM_ENCODERS]; int i; - long width; - long height; + int width; + int height; int length_frame; int frame_avail; int got_data; @@ -347,12 +346,12 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(interface)); - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - framerate = strtol(argv[3], NULL, 0); + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) - die("Invalid resolution: %ldx%ld", width, height); + die("Invalid resolution: %dx%d", width, height); /* Open input video file for encoding */ if (!(infile = fopen(argv[4], "rb"))) @@ -371,15 +370,16 @@ int main(int argc, char **argv) { // Bitrates per spatial layer: overwrite default rates above. for (i = 0; i < NUM_ENCODERS; i++) { - target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); } // Temporal layers per spatial layers: overwrite default settings above. for (i = 0; i < NUM_ENCODERS; i++) { - num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", - num_temporal_layers); + num_temporal_layers[i]); } /* Open file to write out each spatially downsampled input stream. */ @@ -391,9 +391,9 @@ int main(int argc, char **argv) { downsampled_input[i] = fopen(filename, "wb"); } - key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); - show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); /* Populate default encoder configuration */ for (i = 0; i < NUM_ENCODERS; i++) { @@ -437,7 +437,7 @@ int main(int argc, char **argv) { /* Other-resolution encoder settings */ for (i = 1; i < NUM_ENCODERS; i++) { - memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + cfg[i] = cfg[0]; cfg[i].rc_target_bitrate = target_bitrate[i]; @@ -467,12 +467,12 @@ int main(int argc, char **argv) { /* Allocate image for each encoder */ for (i = 0; i < NUM_ENCODERS; i++) if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) - die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h); - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) - read_frame_p = read_frame; + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) + read_frame_p = mulres_read_frame; else - read_frame_p = read_frame_by_row; + read_frame_p = mulres_read_frame_by_row; for (i = 0; i < NUM_ENCODERS; i++) if (outfile[i]) write_ivf_file_header(outfile[i], &cfg[i], 0); @@ -558,7 +558,8 @@ int main(int argc, char **argv) { /* Write out down-sampled input. */ length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; if (fwrite(raw[i].planes[0], 1, length_frame, - downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) { + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { return EXIT_FAILURE; } } @@ -619,10 +620,6 @@ int main(int argc, char **argv) { break; default: break; } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && - (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) - ? "K" - : ""); fflush(stdout); } } @@ -663,7 +660,6 @@ int main(int argc, char **argv) { write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); fclose(outfile[i]); } - printf("\n"); return EXIT_SUCCESS; } diff --git a/media/libvpx/libvpx/examples/vp8cx_set_ref.c b/media/libvpx/libvpx/examples/vp8cx_set_ref.c index 846477c61e..ca528f9e90 100644 --- a/media/libvpx/libvpx/examples/vp8cx_set_ref.c +++ b/media/libvpx/libvpx/examples/vp8cx_set_ref.c @@ -155,7 +155,7 @@ int main(int argc, char **argv) { die("Failed to open %s for reading.", argv[3]); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Encode frames. while (vpx_img_read(&raw, infile)) { diff --git a/media/libvpx/libvpx/examples/vp9_lossless_encoder.c b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c index cb5ca6bfe0..c4eb3a8b17 100644 --- a/media/libvpx/libvpx/examples/vp9_lossless_encoder.c +++ b/media/libvpx/libvpx/examples/vp9_lossless_encoder.c @@ -110,7 +110,7 @@ int main(int argc, char **argv) { die("Failed to open %s for reading.", argv[3]); if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1)) die_codec(&codec, "Failed to use lossless mode"); diff --git a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c index 0e409387b3..16e6aba6cd 100644 --- a/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/media/libvpx/libvpx/examples/vp9_spatial_svc_encoder.c @@ -14,8 +14,11 @@ * that benefit from a scalable bitstream. */ +#include +#include #include #include +#include #include #include #include @@ -24,14 +27,22 @@ #include "../tools_common.h" #include "../video_writer.h" +#include "../vpx_ports/bitops.h" #include "../vpx_ports/vpx_timer.h" -#include "vpx/svc_context.h" +#include "./svc_context.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_decoder.h" #include "vpx/vpx_encoder.h" #include "../vpxstats.h" -#include "vp9/encoder/vp9_encoder.h" +#include "./y4minput.h" + +#define OUTPUT_FRAME_STATS 0 #define OUTPUT_RC_STATS 1 +#define SIMULCAST_MODE 0 + +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); static const arg_def_t skip_frames_arg = ARG_DEF("s", "skip-frames", 1, "input frames to skip"); static const arg_def_t frames_arg = @@ -60,12 +71,6 @@ static const arg_def_t kf_dist_arg = ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes"); static const arg_def_t scale_factors_arg = ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)"); -static const arg_def_t passes_arg = - ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); -static const arg_def_t pass_arg = - ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); -static const arg_def_t fpf_name_arg = - ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); static const arg_def_t min_q_arg = ARG_DEF(NULL, "min-q", 1, "Minimum quantizer"); static const arg_def_t max_q_arg = @@ -86,6 +91,21 @@ static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); static const arg_def_t bitrates_arg = ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); +static const arg_def_t dropframe_thresh_arg = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t psnr_arg = + ARG_DEF(NULL, "psnr", 1, "Enable PSNR computation and statistics"); +static const struct arg_enum_list tune_content_enum[] = { + { "default", VP9E_CONTENT_DEFAULT }, + { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, + { NULL, 0 } +}; + +static const arg_def_t tune_content_arg = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); +static const arg_def_t inter_layer_pred_arg = ARG_DEF( + NULL, "inter-layer-pred", 1, "0 - 3: On, Off, Key-frames, Constrained"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -97,6 +117,7 @@ static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( #endif // CONFIG_VP9_HIGHBITDEPTH static const arg_def_t *svc_args[] = { &frames_arg, + &outputfile, &width_arg, &height_arg, &timebase_arg, @@ -105,9 +126,6 @@ static const arg_def_t *svc_args[] = { &frames_arg, &spatial_layers_arg, &kf_dist_arg, &scale_factors_arg, - &passes_arg, - &pass_arg, - &fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg, @@ -127,6 +145,10 @@ static const arg_def_t *svc_args[] = { &frames_arg, &speed_arg, &rc_end_usage_arg, &bitrates_arg, + &dropframe_thresh_arg, + &tune_content_arg, + &inter_layer_pred_arg, + &psnr_arg, NULL }; static const uint32_t default_frames_to_skip = 0; @@ -145,20 +167,19 @@ static const int32_t default_speed = -1; // -1 means use library default. static const uint32_t default_threads = 0; // zero means use library default. typedef struct { - const char *input_filename; const char *output_filename; uint32_t frames_to_code; uint32_t frames_to_skip; struct VpxInputContext input_ctx; stats_io_t rc_stats; - int passes; - int pass; + int tune_content; + int inter_layer_pred; } AppInput; static const char *exec_name; void usage_exit(void) { - fprintf(stderr, "Usage: %s input_filename output_filename\n", + fprintf(stderr, "Usage: %s input_filename -o output_filename\n", exec_name); fprintf(stderr, "Options:\n"); arg_show_usage(stderr, svc_args); @@ -168,14 +189,11 @@ void usage_exit(void) { static void parse_command_line(int argc, const char **argv_, AppInput *app_input, SvcContext *svc_ctx, vpx_codec_enc_cfg_t *enc_cfg) { - struct arg arg = { 0 }; + struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; vpx_codec_err_t res; - int passes = 0; - int pass = 0; - const char *fpf_file_name = NULL; unsigned int min_bitrate = 0; unsigned int max_bitrate = 0; char string_options[1024] = { 0 }; @@ -190,6 +208,7 @@ static void parse_command_line(int argc, const char **argv_, #endif svc_ctx->speed = default_speed; svc_ctx->threads = default_threads; + svc_ctx->use_psnr = 0; // start with default encoder configuration res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0); @@ -212,11 +231,17 @@ static void parse_command_line(int argc, const char **argv_, // process command line options argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + exit(EXIT_FAILURE); + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; if (arg_match(&arg, &frames_arg, argi)) { app_input->frames_to_code = arg_parse_uint(&arg); + } else if (arg_match(&arg, &outputfile, argi)) { + app_input->output_filename = arg.val; } else if (arg_match(&arg, &width_arg, argi)) { enc_cfg->g_w = arg_parse_uint(&arg); } else if (arg_match(&arg, &height_arg, argi)) { @@ -237,6 +262,9 @@ static void parse_command_line(int argc, const char **argv_, #endif } else if (arg_match(&arg, &speed_arg, argi)) { svc_ctx->speed = arg_parse_uint(&arg); + if (svc_ctx->speed > 9) { + warn("Mapping speed %d to speed 9.\n", svc_ctx->speed); + } } else if (arg_match(&arg, &aqmode_arg, argi)) { svc_ctx->aqmode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { @@ -251,29 +279,25 @@ static void parse_command_line(int argc, const char **argv_, enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; } else if (arg_match(&arg, &scale_factors_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s scale-factors=%s", - string_options, arg.val); + strncat(string_options, " scale-factors=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &bitrates_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s bitrates=%s", - string_options, arg.val); - } else if (arg_match(&arg, &passes_arg, argi)) { - passes = arg_parse_uint(&arg); - if (passes < 1 || passes > 2) { - die("Error: Invalid number of passes (%d)\n", passes); - } - } else if (arg_match(&arg, &pass_arg, argi)) { - pass = arg_parse_uint(&arg); - if (pass < 1 || pass > 2) { - die("Error: Invalid pass selected (%d)\n", pass); - } - } else if (arg_match(&arg, &fpf_name_arg, argi)) { - fpf_file_name = arg.val; + strncat(string_options, " bitrates=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &min_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s", - string_options, arg.val); + strncat(string_options, " min-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &max_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s", - string_options, arg.val); + strncat(string_options, " max-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &min_bitrate_arg, argi)) { min_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &max_bitrate_arg, argi)) { @@ -300,9 +324,16 @@ static void parse_command_line(int argc, const char **argv_, break; default: die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth); - break; } #endif // CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { + enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &tune_content_arg, argi)) { + app_input->tune_content = arg_parse_uint(&arg); + } else if (arg_match(&arg, &inter_layer_pred_arg, argi)) { + app_input->inter_layer_pred = arg_parse_uint(&arg); + } else if (arg_match(&arg, &psnr_arg, argi)) { + svc_ctx->use_psnr = arg_parse_uint(&arg); } else { ++argj; } @@ -312,35 +343,7 @@ static void parse_command_line(int argc, const char **argv_, if (strlen(string_options) > 0) vpx_svc_set_options(svc_ctx, string_options + 1); - if (passes == 0 || passes == 1) { - if (pass) { - fprintf(stderr, "pass is ignored since there's only one pass\n"); - } - enc_cfg->g_pass = VPX_RC_ONE_PASS; - } else { - if (pass == 0) { - die("pass must be specified when passes is 2\n"); - } - - if (fpf_file_name == NULL) { - die("fpf must be specified when passes is 2\n"); - } - - if (pass == 1) { - enc_cfg->g_pass = VPX_RC_FIRST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 0)) { - fatal("Failed to open statistics store"); - } - } else { - enc_cfg->g_pass = VPX_RC_LAST_PASS; - if (!stats_open_file(&app_input->rc_stats, fpf_file_name, 1)) { - fatal("Failed to open statistics store"); - } - enc_cfg->rc_twopass_stats_in = stats_get(&app_input->rc_stats); - } - app_input->passes = passes; - app_input->pass = pass; - } + enc_cfg->g_pass = VPX_RC_ONE_PASS; if (enc_cfg->rc_target_bitrate > 0) { if (min_bitrate > 0) { @@ -358,13 +361,20 @@ static void parse_command_line(int argc, const char **argv_, if (argi[0][0] == '-' && strlen(argi[0]) > 1) die("Error: Unrecognized option %s\n", *argi); - if (argv[0] == NULL || argv[1] == 0) { + if (argv[0] == NULL) { usage_exit(); } - app_input->input_filename = argv[0]; - app_input->output_filename = argv[1]; + app_input->input_ctx.filename = argv[0]; free(argv); + open_input_file(&app_input->input_ctx); + if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { + enc_cfg->g_w = app_input->input_ctx.width; + enc_cfg->g_h = app_input->input_ctx.height; + enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator; + enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator; + } + if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || enc_cfg->g_h % 2) die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); @@ -429,8 +439,9 @@ static void set_rate_control_stats(struct RateControlStats *rc, rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; if (tl > 0) { rc->layer_pfb[layer] = - 1000.0 * (cfg->layer_target_bitrate[layer] - - cfg->layer_target_bitrate[layer - 1]) / + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / @@ -502,14 +513,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc, printf("Average, rms-variance, and percent-fluct: %f %f %f \n", rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), perc_fluctuation); - if (frame_cnt != tot_num_frames) - die("Error: Number of input frames not equal to output encoded frames != " - "%d tot_num_frames = %d\n", - frame_cnt, tot_num_frames); + printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt, + tot_num_frames); } -vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { +static vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, uint64_t sizes[8], + int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. If we have the marker but @@ -561,105 +571,392 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, // bypass/flexible mode. The pattern corresponds to the pattern // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in // non-flexible mode. -void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, - int is_key_frame, - vpx_svc_ref_frame_config_t *ref_frame_config) { +static void set_frame_flags_bypass_mode_ex0( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. if (!tl) { if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF; + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; } else { if (is_key_frame) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; } } } else if (tl == 1) { if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | - VP8_EFLAG_NO_UPD_GF; + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + // Non-zero spatial. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } } } - if (tl == 0) { - ref_frame_config->lst_fb_idx[sl] = sl; - if (sl) - ref_frame_config->gld_fb_idx[sl] = sl - 1; - else - ref_frame_config->gld_fb_idx[sl] = 0; - ref_frame_config->alt_fb_idx[sl] = 0; - } else if (tl == 1) { - ref_frame_config->lst_fb_idx[sl] = sl; - ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; - ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; - } } } +// Example pattern for 2 spatial layers and 2 temporal layers used in the +// bypass/flexible mode, except only 1 spatial layer when temporal_layer_id = 1. +static void set_frame_flags_bypass_mode_ex1( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + if (tl == 0) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[1] = 0; + ref_frame_config->gld_fb_idx[1] = 1; + } else { + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 0; + } + ref_frame_config->alt_fb_idx[1] = 0; + + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 0; + ref_frame_config->alt_fb_idx[0] = 0; + } + if (tl == 1) { + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 1; + ref_frame_config->alt_fb_idx[0] = 2; + + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 2; + ref_frame_config->alt_fb_idx[1] = 3; + } + // Set the reference and update flags. + if (tl == 0) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[0] = 1; + ref_frame_config->reference_golden[0] = 0; + ref_frame_config->reference_alt_ref[0] = 0; + ref_frame_config->update_buffer_slot[0] |= + 1 << ref_frame_config->lst_fb_idx[0]; + + if (is_key_frame) { + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->gld_fb_idx[1]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 1; + ref_frame_config->reference_alt_ref[1] = 1; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->lst_fb_idx[1]; + } + } + if (tl == 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + } +} + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE +static void test_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + vpx_image_t enc_img, dec_img; + struct vp9_ref_frame ref_enc, ref_dec; + if (*mismatch_seen) return; + /* Get the internal reference frame */ + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}\n", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} +#endif + +#if OUTPUT_RC_STATS +static void svc_output_rc_stats( + vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *enc_cfg, + vpx_svc_layer_id_t *layer_id, const vpx_codec_cx_pkt_t *cx_pkt, + struct RateControlStats *rc, VpxVideoWriter **outfile, + const uint32_t frame_cnt, const double framerate) { + int num_layers_encoded = 0; + unsigned int sl, tl; + uint64_t sizes[8]; + uint64_t sizes_parsed[8]; + int count = 0; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + memset(sizes, 0, sizeof(sizes)); + memset(sizes_parsed, 0, sizeof(sizes_parsed)); + vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id); + parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, + sizes_parsed, &count); + if (enc_cfg->ss_number_layers == 1) { + sizes[0] = cx_pkt->data.frame.sz; + } else { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sizes[sl] = 0; + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + unsigned int sl2; + uint64_t tot_size = 0; +#if SIMULCAST_MODE + for (sl2 = 0; sl2 < sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + vpx_video_writer_write_frame(outfile[sl], + (uint8_t *)(cx_pkt->data.frame.buf) + tot_size, + (size_t)(sizes[sl]), cx_pkt->data.frame.pts); +#else + for (sl2 = 0; sl2 <= sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + if (tot_size > 0) + vpx_video_writer_write_frame(outfile[sl], cx_pkt->data.frame.buf, + (size_t)(tot_size), cx_pkt->data.frame.pts); +#endif // SIMULCAST_MODE + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + for (tl = layer_id->temporal_layer_id; tl < enc_cfg->ts_number_layers; + ++tl) { + const int layer = sl * enc_cfg->ts_number_layers + tl; + ++rc->layer_tot_enc_frames[layer]; + rc->layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; + // Keep count of rate control stats per layer, for non-key + // frames. + if (tl == (unsigned int)layer_id->temporal_layer_id && + !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc->layer_avg_frame_size[layer] += 8.0 * sizes[sl]; + rc->layer_avg_rate_mismatch[layer] += + fabs(8.0 * sizes[sl] - rc->layer_pfb[layer]) / + rc->layer_pfb[layer]; + ++rc->layer_enc_frames[layer]; + } + } + } + } + + // Update for short-time encoding bitrate states, for moving + // window of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (frame_cnt > (unsigned int)rc->window_size) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) + sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; + } + if (frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate / rc->window_size) * (sum_bitrate / rc->window_size); + } + } + + // Second shifted window. + if (frame_cnt > (unsigned int)(rc->window_size + rc->window_size / 2)) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; + } + + if (frame_cnt > (unsigned int)(2 * rc->window_size) && + frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate2 / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate2 / rc->window_size) * (sum_bitrate2 / rc->window_size); + } + } +} +#endif + int main(int argc, const char **argv) { - AppInput app_input = { 0 }; + AppInput app_input; VpxVideoWriter *writer = NULL; - VpxVideoInfo info = { 0 }; - vpx_codec_ctx_t codec; + VpxVideoInfo info; + vpx_codec_ctx_t encoder; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; uint32_t i; uint32_t frame_cnt = 0; vpx_image_t raw; vpx_codec_err_t res; int pts = 0; /* PTS starts at 0 */ int frame_duration = 1; /* 1 timebase tick per frame */ - FILE *infile = NULL; int end_of_stream = 0; +#if OUTPUT_FRAME_STATS int frames_received = 0; +#endif #if OUTPUT_RC_STATS - VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; + VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; struct RateControlStats rc; vpx_svc_layer_id_t layer_id; vpx_svc_ref_frame_config_t ref_frame_config; - unsigned int sl, tl; - double sum_bitrate = 0.0; - double sum_bitrate2 = 0.0; + unsigned int sl; double framerate = 30.0; #endif struct vpx_usec_timer timer; int64_t cx_time = 0; +#if CONFIG_INTERNAL_STATS + FILE *f = fopen("opsnr.stt", "a"); +#endif +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + int mismatch_seen = 0; + vpx_codec_ctx_t decoder; +#endif memset(&svc_ctx, 0, sizeof(svc_ctx)); - svc_ctx.log_print = 1; + memset(&app_input, 0, sizeof(AppInput)); + memset(&info, 0, sizeof(VpxVideoInfo)); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&rc, 0, sizeof(struct RateControlStats)); exec_name = argv[0]; + + /* Setup default input stream settings */ + app_input.input_ctx.framerate.numerator = 30; + app_input.input_ctx.framerate.denominator = 1; + app_input.input_ctx.only_i420 = 1; + app_input.input_ctx.bit_depth = 0; + parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg); + // Y4M reader handles its own allocation. + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, - enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, + enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #endif // CONFIG_VP9_HIGHBITDEPTH - - if (!(infile = fopen(app_input.input_filename, "rb"))) - die("Failed to open %s for reading\n", app_input.input_filename); + } // Initialize codec - if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != + if (vpx_svc_init(&svc_ctx, &encoder, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK) die("Failed to initialize encoder\n"); +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_dec_init( + &decoder, get_vpx_decoder_by_name("vp9")->codec_interface(), NULL, 0)) + die("Failed to initialize decoder\n"); +#endif #if OUTPUT_RC_STATS + rc.window_count = 1; + rc.window_size = 15; // Silence a static analysis warning. + rc.avg_st_encoding_bitrate = 0.0; + rc.variance_st_encoding_bitrate = 0.0; if (svc_ctx.output_rc_stat) { set_rate_control_stats(&rc, &enc_cfg); framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num; @@ -667,48 +964,79 @@ int main(int argc, const char **argv) { #endif info.codec_fourcc = VP9_FOURCC; + info.frame_width = enc_cfg.g_w; + info.frame_height = enc_cfg.g_h; info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; - if (!(app_input.passes == 2 && app_input.pass == 1)) { - // We don't save the bitstream for the 1st pass on two pass rate control - writer = - vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); - if (!writer) - die("Failed to open %s for writing\n", app_input.output_filename); - } + writer = + vpx_video_writer_open(app_input.output_filename, kContainerIVF, &info); + if (!writer) + die("Failed to open %s for writing\n", app_input.output_filename); + #if OUTPUT_RC_STATS - // For now, just write temporal layer streams. - // TODO(marpan): do spatial by re-writing superframe. + // Write out spatial layer stream. + // TODO(marpan/jianj): allow for writing each spatial and temporal stream. if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { char file_name[PATH_MAX]; - snprintf(file_name, sizeof(file_name), "%s_t%d.ivf", - app_input.output_filename, tl); - outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info); - if (!outfile[tl]) die("Failed to open %s for writing", file_name); + snprintf(file_name, sizeof(file_name), "%s_s%d.ivf", + app_input.output_filename, sl); + outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[sl]) die("Failed to open %s for writing", file_name); } } #endif // skip initial frames - for (i = 0; i < app_input.frames_to_skip; ++i) vpx_img_read(&raw, infile); + for (i = 0; i < app_input.frames_to_skip; ++i) + read_frame(&app_input.input_ctx, &raw); if (svc_ctx.speed != -1) - vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); - if (svc_ctx.threads) - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + vpx_codec_control(&encoder, VP8E_SET_CPUUSED, svc_ctx.speed); + if (svc_ctx.threads) { + vpx_codec_control(&encoder, VP9E_SET_TILE_COLUMNS, + get_msb(svc_ctx.threads)); + if (svc_ctx.threads > 1) + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 0); + } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&encoder, VP9E_SET_AQ_MODE, 3); if (svc_ctx.speed >= 5) - vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&encoder, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&encoder, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + + vpx_codec_control(&encoder, VP9E_SET_SVC_INTER_LAYER_PRED, + app_input.inter_layer_pred); + + vpx_codec_control(&encoder, VP9E_SET_NOISE_SENSITIVITY, 0); + + vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); + + vpx_codec_control(&encoder, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); + vpx_codec_control(&encoder, VP9E_SET_DISABLE_LOOPFILTER, 0); + + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&encoder, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *cx_pkt; - if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) { + // Example patterns for bypass/flexible mode: + // example_pattern = 0: 2 temporal layers, and spatial_layers = 1,2,3. Exact + // to fixed SVC patterns. example_pattern = 1: 2 spatial and 2 temporal + // layers, with SL0 only has TL0, and SL1 has both TL0 and TL1. This example + // uses the extended API. + int example_pattern = 0; + if (frame_cnt >= app_input.frames_to_code || + !read_frame(&app_input.input_ctx, &raw)) { // We need one extra vpx_svc_encode call at end of stream to flush // encoder and get remaining data end_of_stream = 1; @@ -716,149 +1044,148 @@ int main(int argc, const char **argv) { // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates) // and the buffer indices for each spatial layer of the current - // (super)frame to be encoded. The temporal layer_id for the current frame - // also needs to be set. + // (super)frame to be encoded. The spatial and temporal layer_id for the + // current frame also needs to be set. // TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS" // mode to "VP9E_LAYERING_MODE_BYPASS". if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { layer_id.spatial_layer_id = 0; // Example for 2 temporal layers. - if (frame_cnt % 2 == 0) + if (frame_cnt % 2 == 0) { layer_id.temporal_layer_id = 0; - else + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 0; + } else { layer_id.temporal_layer_id = 1; - // Note that we only set the temporal layer_id, since we are calling - // the encode for the whole superframe. The encoder will internally loop - // over all the spatial layers for the current superframe. - vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); - set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, - svc_ctx.spatial_layers, frame_cnt == 0, - &ref_frame_config); - vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG, + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 1; + } + if (example_pattern == 1) { + // example_pattern 1 is hard-coded for 2 spatial and 2 temporal layers. + assert(svc_ctx.spatial_layers == 2); + assert(svc_ctx.temporal_layers == 2); + if (frame_cnt % 2 == 0) { + // Spatial layer 0 and 1 are encoded. + layer_id.temporal_layer_id_per_spatial[0] = 0; + layer_id.temporal_layer_id_per_spatial[1] = 0; + layer_id.spatial_layer_id = 0; + } else { + // Only spatial layer 1 is encoded here. + layer_id.temporal_layer_id_per_spatial[1] = 1; + layer_id.spatial_layer_id = 1; + } + } + vpx_codec_control(&encoder, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_model() for case of periodic key frames. + if (example_pattern == 0) { + set_frame_flags_bypass_mode_ex0(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } else if (example_pattern == 1) { + set_frame_flags_bypass_mode_ex1(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } + ref_frame_config.duration[0] = frame_duration * 1; + ref_frame_config.duration[1] = frame_duration * 1; + + vpx_codec_control(&encoder, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); // Keep track of input frames, to account for frame drops in rate control // stats/metrics. - for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id]; } + } else { + // For the fixed pattern SVC, temporal layer is given by superframe count. + unsigned int tl = 0; + if (enc_cfg.ts_number_layers == 2) + tl = (frame_cnt % 2 != 0); + else if (enc_cfg.ts_number_layers == 3) { + if (frame_cnt % 2 != 0) tl = 2; + if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1; + } + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl]; } vpx_usec_timer_start(&timer); res = vpx_svc_encode( - &svc_ctx, &codec, (end_of_stream ? NULL : &raw), pts, frame_duration, + &svc_ctx, &encoder, (end_of_stream ? NULL : &raw), pts, frame_duration, svc_ctx.speed >= 5 ? VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY); vpx_usec_timer_mark(&timer); cx_time += vpx_usec_timer_elapsed(&timer); - printf("%s", vpx_svc_get_message(&svc_ctx)); fflush(stdout); if (res != VPX_CODEC_OK) { - die_codec(&codec, "Failed to encode frame"); + die_codec(&encoder, "Failed to encode frame"); } - while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) { + while ((cx_pkt = vpx_codec_get_cx_data(&encoder, &iter)) != NULL) { switch (cx_pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { -#if OUTPUT_RC_STATS - uint32_t sizes[8]; - int count = 0; -#endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { - vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); - parse_superframe_index(cx_pkt->data.frame.buf, - cx_pkt->data.frame.sz, sizes, &count); - // Note computing input_layer_frames here won't account for frame - // drops in rate control stats. - // TODO(marpan): Fix this for non-bypass mode so we can get stats - // for dropped frames. - if (svc_ctx.temporal_layering_mode != - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + - layer_id.temporal_layer_id]; - } - } - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_write_frame( - outfile[tl], cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, - cx_pkt->data.frame.pts); - } - - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - const int layer = sl * enc_cfg.ts_number_layers + tl; - ++rc.layer_tot_enc_frames[layer]; - rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; - // Keep count of rate control stats per layer, for non-key - // frames. - if (tl == (unsigned int)layer_id.temporal_layer_id && - !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { - rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; - rc.layer_avg_rate_mismatch[layer] += - fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) / - rc.layer_pfb[layer]; - ++rc.layer_enc_frames[layer]; - } - } - } - - // Update for short-time encoding bitrate states, for moving - // window of size rc->window, shifted by rc->window / 2. - // Ignore first window segment, due to key frame. - if (frame_cnt > (unsigned int)rc.window_size) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; - } - if (frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate / rc.window_size) * - (sum_bitrate / rc.window_size); - sum_bitrate = 0.0; - } - } - - // Second shifted window. - if (frame_cnt > - (unsigned int)(rc.window_size + rc.window_size / 2)) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; - } - - if (frame_cnt > (unsigned int)(2 * rc.window_size) && - frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate2 / rc.window_size) * - (sum_bitrate2 / rc.window_size); - sum_bitrate2 = 0.0; - } - } + svc_output_rc_stats(&encoder, &enc_cfg, &layer_id, cx_pkt, &rc, + outfile, frame_cnt, framerate); } #endif } - /* +#if OUTPUT_FRAME_STATS printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY), (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); - */ - if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) - si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; ++frames_received; +#endif + if (enc_cfg.ss_number_layers > 1) { + uint64_t sizes[8] = { 0 }; + int count = 0; + int num_layers_encoded = 0; + parse_superframe_index(cx_pkt->data.frame.buf, + cx_pkt->data.frame.sz, sizes, &count); + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + si->bytes_sum[sl] += (int)sizes[num_layers_encoded]; + num_layers_encoded++; + } + } + } else { + si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; + } +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, + (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) + die_codec(&decoder, "Failed to decode frame."); + vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); + // Don't look for mismatch on top spatial and top temporal layers as + // they are non reference frames. Don't look at frames whose top + // spatial layer is dropped. + if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1] && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == + (int)enc_cfg.ts_number_layers - 1)) { + test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); + } +#endif + break; + } + case VPX_CODEC_PSNR_PKT: { + SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; + sl = cx_pkt->data.psnr.spatial_layer_id; + si->number_of_frames[sl]++; + for (int j = 0; j < 4; ++j) { + si->psnr_sum[sl][j] += cx_pkt->data.psnr.psnr[j]; + si->sse_sum[sl][j] += cx_pkt->data.psnr.sse[j]; + } break; } case VPX_CODEC_STATS_PKT: { @@ -866,7 +1193,9 @@ int main(int argc, const char **argv) { cx_pkt->data.twopass_stats.sz); break; } - default: { break; } + default: { + break; + } } } @@ -876,41 +1205,44 @@ int main(int argc, const char **argv) { } } - // Compensate for the extra frame count for the bypass mode. - if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - const int layer = - sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id; - --rc.layer_input_frames[layer]; - } - } - printf("Processed %d frames\n", frame_cnt); - fclose(infile); + + close_input_file(&app_input.input_ctx); + #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { printout_rate_control_summary(&rc, &enc_cfg, frame_cnt); printf("\n"); } #endif - if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); - if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); + if (vpx_codec_destroy(&encoder)) + die_codec(&encoder, "Failed to destroy codec"); if (writer) { vpx_video_writer_close(writer); } #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_close(outfile[tl]); + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + vpx_video_writer_close(outfile[sl]); } } +#endif +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); #endif printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), 1000000 * (double)frame_cnt / (double)cx_time); - vpx_img_free(&raw); + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } // display average size, psnr - printf("%s", vpx_svc_dump_statistics(&svc_ctx)); + vpx_svc_dump_statistics(&svc_ctx); vpx_svc_release(&svc_ctx); return EXIT_SUCCESS; } diff --git a/media/libvpx/libvpx/examples/vp9cx_set_ref.c b/media/libvpx/libvpx/examples/vp9cx_set_ref.c index 3472689db2..6e12d668b0 100644 --- a/media/libvpx/libvpx/examples/vp9cx_set_ref.c +++ b/media/libvpx/libvpx/examples/vp9cx_set_ref.c @@ -60,7 +60,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n", @@ -68,128 +68,6 @@ void usage_exit() { exit(EXIT_FAILURE); } -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - -#define mmin(a, b) ((a) < (b) ? (a) : (b)) -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, unsigned int frame_out, int *mismatch_seen) { vpx_image_t enc_img, dec_img; @@ -373,7 +251,7 @@ int main(int argc, char **argv) { die("Failed to open %s for reading.", infile_arg); if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0)) - die_codec(&ecodec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); // Disable alt_ref. if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0)) diff --git a/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc new file mode 100644 index 0000000000..7af31095b9 --- /dev/null +++ b/media/libvpx/libvpx/examples/vpx_dec_fuzzer.cc @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Fuzzer for libvpx decoders + * ========================== + * Requirements + * -------------- + * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker + * option. + + * Steps to build + * -------------- + * Clone libvpx repository + $git clone https://chromium.googlesource.com/webm/libvpx + + * Create a directory in parallel to libvpx and change directory + $mkdir vpx_dec_fuzzer + $cd vpx_dec_fuzzer/ + + * Enable sanitizers (Supported: address integer memory thread undefined) + $source ../libvpx/tools/set_analyzer_env.sh address + + * Configure libvpx. + * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid + * Out of memory errors when running generated fuzzer binary + $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \ + --extra-cflags="-fsanitize=fuzzer-no-link \ + -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \ + --disable-webm-io --enable-debug --disable-vp8-encoder \ + --disable-vp9-encoder --disable-examples + + * Build libvpx + $make -j32 + + * Build vp9 fuzzer + $ $CXX $CXXFLAGS -std=gnu++17 -DDECODER=vp9 \ + -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \ + ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \ + ./libvpx.a -Wl,--end-group + + * DECODER should be defined as vp9 or vp8 to enable vp9/vp8 + * + * create a corpus directory and copy some ivf files there. + * Based on which codec (vp8/vp9) is being tested, it is recommended to + * have corresponding ivf files in corpus directory + * Empty corpus directoy also is acceptable, though not recommended + $mkdir CORPUS && cp some-files CORPUS + + * Run fuzzing: + $./vpx_dec_fuzzer_vp9 CORPUS + + * References: + * http://llvm.org/docs/LibFuzzer.html + * https://github.com/google/oss-fuzz + */ + +#include +#include +#include +#include +#include +#include + +#include "third_party/nalloc/nalloc.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_ports/mem_ops.h" + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name) +#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx() + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= IVF_FILE_HDR_SZ) { + return 0; + } + nalloc_init(nullptr); + + vpx_codec_ctx_t codec; + // Set thread count in the range [1, 64]. + const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + vpx_codec_dec_cfg_t cfg = { threads, 0, 0 }; + vpx_codec_flags_t flags = 0; + if ((data[IVF_FILE_HDR_SZ] & 0x40) != 0) { + flags |= VPX_CODEC_USE_POSTPROC; + } + vpx_codec_err_t err = + vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags); + if (err == VPX_CODEC_INCAPABLE) { + // vpx_codec_dec_init may fail with VPX_CODEC_USE_POSTPROC + // if the library is configured with --disable-postproc. + flags = 0; + if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, flags)) { + return 0; + } + } else if (err != 0) { + return 0; + } + + nalloc_start(data, size); + + if (threads > 1) { + const int enable = (data[IVF_FILE_HDR_SZ] & 0xa0) != 0; + err = vpx_codec_control(&codec, VP9D_SET_LOOP_FILTER_OPT, enable); + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + int frame_cnt = 0; + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + vpx_codec_stream_info_t stream_info; + stream_info.sz = sizeof(stream_info); + err = vpx_codec_peek_stream_info(VPXD_INTERFACE(DECODER), data, size, + &stream_info); + + ++frame_cnt; + if (flags & VPX_CODEC_USE_POSTPROC) { + if (frame_cnt % 16 == 4) { + vp8_postproc_cfg_t pp = { 0, 0, 0 }; + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail; + } else if (frame_cnt % 16 == 12) { + vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE, 4, + 0 }; + if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp)) goto fail; + } + } + + err = vpx_codec_decode(&codec, data, frame_size, nullptr, 0); + static_cast(err); + vpx_codec_iter_t iter = nullptr; + vpx_image_t *img = nullptr; + while ((img = vpx_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } +fail: + vpx_codec_destroy(&codec); + nalloc_end(); + return 0; +} diff --git a/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc b/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc new file mode 100644 index 0000000000..4b99663a1a --- /dev/null +++ b/media/libvpx/libvpx/examples/vpx_enc_fuzzer.cc @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Fuzzer for libvpx encoders + * ========================== + * Requirements + * -------------- + * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker + * option. + + * Steps to build + * -------------- + * Clone libvpx repository + $git clone https://chromium.googlesource.com/webm/libvpx + + * Create a directory in parallel to libvpx and change directory + $mkdir vpx_enc_fuzzer + $cd vpx_enc_fuzzer/ + + * Enable sanitizers (Supported: address integer memory thread undefined) + $source ../libvpx/tools/set_analyzer_env.sh address + + * Configure libvpx. + * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid + * Out of memory errors when running generated fuzzer binary + $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \ + --extra-cflags="-fsanitize=fuzzer-no-link \ + -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \ + --disable-webm-io --enable-debug --enable-vp8-encoder \ + --enable-vp9-encoder --disable-examples + + * Build libvpx + $make -j32 + + * Build vp9 fuzzer + $ $CXX $CXXFLAGS -std=gnu++17 -DENCODER=vp9 \ + -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \ + ../libvpx/examples/vpx_enc_fuzzer.cc -o ./vpx_enc_fuzzer_vp9 \ + ./libvpx.a -Wl,--end-group + + * ENCODER should be defined as vp9 or vp8 to enable vp9/vp8 + * + * create a corpus directory and copy some ivf files there. + * Based on which codec (vp8/vp9) is being tested, it is recommended to + * have corresponding ivf files in corpus directory + * Empty corpus directory also is acceptable, though not recommended + $mkdir CORPUS && cp some-files CORPUS + + * Run fuzzing: + $./vpx_enc_fuzzer_vp9 CORPUS + + * References: + * http://llvm.org/docs/LibFuzzer.html + * https://github.com/google/oss-fuzz + */ + +#include +#include +#include +#include +#include +#include + +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#include "vpx_ports/mem_ops.h" +#include "third_party/nalloc/nalloc.h" + +// fuzz header to have config options, before raw image data +#define FUZZ_HDR_SZ 32 + +#define VPXC_INTERFACE(name) VPXC_INTERFACE_(name) +#define VPXC_INTERFACE_(name) vpx_codec_##name##_cx() + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +static int vpx_img_plane_width(const vpx_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +static int vpx_img_plane_height(const vpx_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +static int fuzz_vpx_img_read(vpx_image_t *img, const uint8_t *data, + size_t size) { + int plane; + // TODO: wtc - Need to clamp the sample values so that they are in range + // For example, if the bit depth is 10, the sample values must be <= 1023. + assert(img->bit_depth == 8); + const size_t bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + + if (size == 0) return 0; + size_t used = 0; + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + int w = vpx_img_plane_width(img, plane); + const int h = vpx_img_plane_height(img, plane); + int y; + + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + + for (y = 0; y < h; ++y) { + size_t nb = bytespp * w; + if (nb > size - used) { + nb = size - used; + } + memcpy(buf, data, nb); + memset(buf + nb, 0, bytespp * w - nb); + buf += stride; + data += nb; + used += nb; + } + } + + return used; +} + +static int encode_frame(vpx_codec_ctx_t *codec, vpx_image_t *img, + int frame_index, int flags, FILE *out, + vpx_enc_deadline_t quality) { + int got_pkts = 0; + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt = NULL; + const vpx_codec_err_t res = + vpx_codec_encode(codec, img, frame_index, 1, flags, quality); + if (res != VPX_CODEC_OK) return 0; + + while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out) != + pkt->data.frame.sz) + return 0; + } + } + + return got_pkts; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= FUZZ_HDR_SZ) { + return 0; + } + nalloc_init(nullptr); + + int keyframe_interval = 0; + int frame_count = 0; + vpx_codec_ctx_t codec; + vpx_image_t raw; + vpx_codec_enc_cfg_t cfg; + vpx_enc_deadline_t quality = VPX_DL_GOOD_QUALITY; + + if ((data[0] & 0x80) != 0) { + keyframe_interval = 8; + } + if ((data[0] & 0x40) != 0) { + quality = VPX_DL_REALTIME; + } else if ((data[0] & 0x20) != 0) { + quality = VPX_DL_BEST_QUALITY; + } + + if (vpx_codec_enc_config_default(VPXC_INTERFACE(ENCODER), &cfg, 0)) abort(); + FILE *out = fopen("/dev/null", "wb"); + + switch (data[0] & 0x1F) { + case 0: cfg.g_w = 64; cfg.g_h = 1; + case 1: cfg.g_w = 1; cfg.g_h = 48; + case 2: cfg.g_w = 1; cfg.g_h = 1; + case 3: cfg.g_w = 4; cfg.g_h = 4; + case 4: cfg.g_w = 16; cfg.g_h = 16; + default: cfg.g_w = 64; cfg.g_h = 48; + } + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 30; // fps + cfg.rc_target_bitrate = 200; + cfg.g_error_resilient = 1; + + if (vpx_codec_enc_init(&codec, VPXC_INTERFACE(ENCODER), &cfg, 0)) { + return 0; + } + + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1)) { + goto fail; + } + + nalloc_start(data, size); + // We may want to add more config options (for more complex encoders as seen + // in the examples) in the future while still maintaining the same format (so + // that generated corpus is still valid). So we reserve FUZZ_HDR_SZ=32 bytes + // for this even if we just use one byte so far. + data += FUZZ_HDR_SZ; + size -= FUZZ_HDR_SZ; + + // Encode frames. + while (1) { + int flags = 0; + size_t size_read = fuzz_vpx_img_read(&raw, data, size); + if (size_read == 0) break; + data += size_read; + size -= size_read; + if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) + flags |= VPX_EFLAG_FORCE_KF; + encode_frame(&codec, &raw, frame_count++, flags, out, quality); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, out, quality)) { + } + +fail: + nalloc_end(); + vpx_img_free(&raw); + vpx_codec_destroy(&codec); + fclose(out); + return 0; +} diff --git a/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c index b906980835..01badbe0cf 100644 --- a/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/media/libvpx/libvpx/examples/vpx_temporal_svc_encoder.c @@ -19,24 +19,38 @@ #include #include "./vpx_config.h" +#include "./y4minput.h" #include "../vpx_ports/vpx_timer.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" +#include "vpx_ports/bitops.h" #include "../tools_common.h" #include "../video_writer.h" +#define ROI_MAP 0 + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)) + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } -// Denoiser states, for temporal denoising. -enum denoiserState { - kDenoiserOff, - kDenoiserOnYOnly, - kDenoiserOnYUV, - kDenoiserOnYUVAggressive, - kDenoiserOnAdaptive +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; @@ -79,19 +93,21 @@ struct RateControlMetrics { // in the stream. static void set_rate_control_metrics(struct RateControlMetrics *rc, vpx_codec_enc_cfg_t *cfg) { - unsigned int i = 0; + int i = 0; // Set the layer (cumulative) framerate and the target layer (non-cumulative) // per-frame-bandwidth, for the rate control encoding stats below. const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + const int ts_number_layers = cfg->ts_number_layers; rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0]; rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0]; - for (i = 0; i < cfg->ts_number_layers; ++i) { + for (i = 0; i < ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; - rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - - rc->layer_target_bitrate[i - 1]) / - (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; rc->layer_enc_frames[i] = 0; @@ -104,6 +120,9 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, rc->window_size = 15; rc->avg_st_encoding_bitrate = 0.0; rc->variance_st_encoding_bitrate = 0.0; + // Target bandwidth for the whole stream. + // Set to layer_target_bitrate for highest layer (total bitrate). + cfg->rc_target_bitrate = rc->layer_target_bitrate[ts_number_layers - 1]; } static void printout_rate_control_summary(struct RateControlMetrics *rc, @@ -154,6 +173,107 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { + unsigned int i, j; + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); + roi->delta_q[1] = -63; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); + + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // previous frame. + zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Froce using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using alfref but not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} + +static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi, + int *skip_map, int *prev_mask_map, int frame_num) { + const int block_size = 8; + unsigned int i, j; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; + zero(roi->skip); + zero(roi->delta_q); + zero(roi->delta_lf); + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + // Use segment 3 for skip. + roi->skip[3] = 1; + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + const int idx = i * roi->cols + j; + // Use segment 3 for skip. + // prev_mask_map keeps track of blocks that have been stably on segment 3 + // for the past 10 frames. Only skip when the block is on segment 3 in + // both current map and prev_mask_map. + if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3; + // Reset it every 10 frames so it doesn't propagate for too many frames. + if (frame_num % 10 == 0) + prev_mask_map[idx] = skip_map[idx]; + else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0) + prev_mask_map[idx] = 0; + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -486,6 +606,28 @@ static void set_temporal_layer_pattern(int layering_mode, } } +#if ROI_MAP +static int read_mask(FILE *mask_file, int *seg_map, int allowed_mask_rows, + int allowed_mask_cols) { + int mask_rows, mask_cols, i, j; + int *map_start = seg_map; + if (fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows) != 2) return 0; + if (mask_rows != allowed_mask_rows || mask_cols != allowed_mask_cols) { + return 0; + } + for (i = 0; i < mask_rows; i++) { + for (j = 0; j < mask_cols; j++) { + if (fscanf(mask_file, "%d ", &seg_map[j]) != 1) return 0; + // reverse the bit + seg_map[j] = 1 - seg_map[j]; + } + seg_map += mask_cols; + } + seg_map = map_start; + return 1; +} +#endif + int main(int argc, char **argv) { VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; vpx_codec_ctx_t codec; @@ -495,6 +637,7 @@ int main(int argc, char **argv) { vpx_codec_err_t res; unsigned int width; unsigned int height; + uint32_t error_resilient = 0; int speed; int frame_avail; int got_data; @@ -505,16 +648,15 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) - vpx_svc_layer_id_t layer_id = { 0, 0 }; -#else - vpx_svc_layer_id_t layer_id = { 0 }; +#if ROI_MAP + vpx_roi_map_t roi; #endif + vpx_svc_layer_id_t layer_id; const VpxInterface *encoder = NULL; - FILE *infile = NULL; + struct VpxInputContext input_ctx; struct RateControlMetrics rc; int64_t cx_time = 0; - const int min_args_base = 12; + const int min_args_base = 13; #if CONFIG_VP9_HIGHBITDEPTH vpx_bit_depth_t bit_depth = VPX_BITS_8; int input_bit_depth = 8; @@ -525,18 +667,36 @@ int main(int argc, char **argv) { double sum_bitrate = 0.0; double sum_bitrate2 = 0.0; double framerate = 30.0; +#if ROI_MAP + FILE *mask_file = NULL; + int block_size = 8; + int mask_rows = 0; + int mask_cols = 0; + int *mask_map; + int *prev_mask_map; +#endif + zero(rc.layer_target_bitrate); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&input_ctx, 0, sizeof(input_ctx)); + /* Setup default input stream settings */ + input_ctx.framerate.numerator = 30; + input_ctx.framerate.denominator = 1; + input_ctx.only_i420 = 1; + input_ctx.bit_depth = 0; exec_name = argv[0]; // Check usage and arguments. if (argc < min_args) { #if CONFIG_VP9_HIGHBITDEPTH die("Usage: %s " - " " + " " + " " " ... \n", argv[0]); #else die("Usage: %s " - " " + " " + " " " ... \n", argv[0]); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -553,14 +713,23 @@ int main(int argc, char **argv) { die("Invalid resolution: %d x %d", width, height); } - layering_mode = (int)strtol(argv[11], NULL, 0); + layering_mode = (int)strtol(argv[12], NULL, 0); if (layering_mode < 0 || layering_mode > 13) { - die("Invalid layering mode (0..12) %s", argv[11]); + die("Invalid layering mode (0..12) %s", argv[12]); } +#if ROI_MAP + if (argc != min_args + mode_to_num_layers[layering_mode] + 1) { + die("Invalid number of arguments"); + } +#else if (argc != min_args + mode_to_num_layers[layering_mode]) { die("Invalid number of arguments"); } +#endif + + input_ctx.filename = argv[1]; + open_input_file(&input_ctx); #if CONFIG_VP9_HIGHBITDEPTH switch (strtol(argv[argc - 1], NULL, 0)) { @@ -578,14 +747,22 @@ int main(int argc, char **argv) { break; default: die("Invalid bit depth (8, 10, 12) %s", argv[argc - 1]); } - if (!vpx_img_alloc( - &raw, bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, - width, height, 32)) { - die("Failed to allocate image", width, height); + + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc( + &raw, + bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, + width, height, 32)) { + die("Failed to allocate image (%dx%d)", width, height); + } } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { - die("Failed to allocate image", width, height); + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { + die("Failed to allocate image (%dx%d)", width, height); + } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -616,14 +793,17 @@ int main(int argc, char **argv) { if (speed < 0) { die("Invalid speed setting: must be positive"); } + if (strncmp(encoder->name, "vp9", 3) == 0 && speed > 9) { + warn("Mapping speed %d to speed 9.\n", speed); + } for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { - rc.layer_target_bitrate[i - 12] = (int)strtol(argv[i], NULL, 0); + rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0); if (strncmp(encoder->name, "vp8", 3) == 0) - cfg.ts_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12]; + cfg.ts_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; else if (strncmp(encoder->name, "vp9", 3) == 0) - cfg.layer_target_bitrate[i - 12] = rc.layer_target_bitrate[i - 12]; + cfg.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13]; } // Real time parameters. @@ -634,7 +814,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; - cfg.rc_buf_initial_sz = 500; + cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; @@ -642,10 +822,14 @@ int main(int argc, char **argv) { cfg.rc_resize_allowed = 0; // Use 1 thread as default. - cfg.g_threads = (unsigned int)strtoul(argv[10], NULL, 0); + cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0); + error_resilient = (uint32_t)strtoul(argv[10], NULL, 0); + if (error_resilient != 0 && error_resilient != 1) { + die("Invalid value for error resilient (0, 1): %d.", error_resilient); + } // Enable error resilient mode. - cfg.g_error_resilient = 1; + cfg.g_error_resilient = error_resilient; cfg.g_lag_in_frames = 0; cfg.kf_mode = VPX_KF_AUTO; @@ -659,13 +843,15 @@ int main(int argc, char **argv) { set_rate_control_metrics(&rc, &cfg); - // Target bandwidth for the whole stream. - // Set to layer_target_bitrate for highest layer (total bitrate). - cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1]; - - // Open input file. - if (!(infile = fopen(argv[1], "rb"))) { - die("Failed to open %s for reading", argv[1]); + if (input_ctx.file_type == FILE_TYPE_Y4M) { + if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) { + die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); + } + if (input_ctx.framerate.numerator != cfg.g_timebase.den || + input_ctx.framerate.denominator != cfg.g_timebase.num) { + die("Incorrect framerate: numerator %d denominator %d", + cfg.g_timebase.num, cfg.g_timebase.den); + } } framerate = cfg.g_timebase.den / cfg.g_timebase.num; @@ -696,25 +882,45 @@ int main(int argc, char **argv) { #else if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) #endif // CONFIG_VP9_HIGHBITDEPTH - die_codec(&codec, "Failed to initialize encoder"); + die("Failed to initialize encoder"); + +#if ROI_MAP + mask_rows = (cfg.g_h + block_size - 1) / block_size; + mask_cols = (cfg.g_w + block_size - 1) / block_size; + mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); + prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map)); +#endif if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); + vpx_codec_control(&codec, VP9E_SET_POSTENCODE_DROP, 0); + vpx_codec_control(&codec, VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, 0); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); + vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0); + + if (cfg.g_threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1 : 0)) die_codec(&codec, "Failed to set SVC"); for (i = 0; i < cfg.ts_number_layers; ++i) { @@ -733,7 +939,7 @@ int main(int argc, char **argv) { // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { - const int max_intra_size_pct = 900; + const int max_intra_size_pct = 1000; vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } @@ -743,12 +949,14 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) +#if ROI_MAP + char mask_file_name[255]; +#endif // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; -#endif layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; + layer_id.temporal_layer_id_per_spatial[0] = layer_id.temporal_layer_id; if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); } else if (strncmp(encoder->name, "vp8", 3) == 0) { @@ -757,7 +965,24 @@ int main(int argc, char **argv) { } flags = layer_flags[frame_cnt % flag_periodicity]; if (layering_mode == 0) flags = 0; - frame_avail = vpx_img_read(&raw, infile); +#if ROI_MAP + snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt", + argv[argc - 1], frame_cnt); + mask_file = fopen(mask_file_name, "r"); + if (mask_file != NULL) { + int mask_is_valid = read_mask(mask_file, mask_map, mask_rows, mask_cols); + fclose(mask_file); + if (mask_is_valid) { + // set_roi_map(encoder->name, &cfg, &roi); + set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + } else { + die_codec(&codec, "Mask input is invalid for ROI map"); + } + } +#endif + frame_avail = read_frame(&input_ctx, &raw); if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; vpx_usec_timer_start(&timer); if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags, @@ -794,6 +1019,7 @@ int main(int argc, char **argv) { // Update for short-time encoding bitrate states, for moving window // of size rc->window, shifted by rc->window / 2. // Ignore first window segment, due to key frame. + if (rc.window_size == 0) rc.window_size = 15; if (frame_cnt > rc.window_size) { sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; if (frame_cnt % rc.window_size == 0) { @@ -825,7 +1051,11 @@ int main(int argc, char **argv) { ++frame_cnt; pts += frame_duration; } - fclose(infile); +#if ROI_MAP + free(mask_map); + free(prev_mask_map); +#endif + close_input_file(&input_ctx); printout_rate_control_summary(&rc, &cfg, frame_cnt); printf("\n"); printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", @@ -837,6 +1067,12 @@ int main(int argc, char **argv) { // Try to rewrite the output file headers with the actual frame count. for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); - vpx_img_free(&raw); + if (input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } + +#if ROI_MAP + free(roi.roi_map); +#endif return EXIT_SUCCESS; } diff --git a/media/libvpx/libvpx/ivfdec.c b/media/libvpx/libvpx/ivfdec.c index f64e594ab0..3e179bc6ed 100644 --- a/media/libvpx/libvpx/ivfdec.c +++ b/media/libvpx/libvpx/ivfdec.c @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } diff --git a/media/libvpx/libvpx/ivfdec.h b/media/libvpx/libvpx/ivfdec.h index af725572b4..847cd79f3f 100644 --- a/media/libvpx/libvpx/ivfdec.h +++ b/media/libvpx/libvpx/ivfdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFDEC_H_ -#define IVFDEC_H_ +#ifndef VPX_IVFDEC_H_ +#define VPX_IVFDEC_H_ #include "./tools_common.h" @@ -25,4 +25,4 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, } /* extern "C" */ #endif -#endif // IVFDEC_H_ +#endif // VPX_IVFDEC_H_ diff --git a/media/libvpx/libvpx/ivfenc.c b/media/libvpx/libvpx/ivfenc.c index a50d31839d..2e8e04283a 100644 --- a/media/libvpx/libvpx/ivfenc.c +++ b/media/libvpx/libvpx/ivfenc.c @@ -13,27 +13,35 @@ #include "vpx/vpx_encoder.h" #include "vpx_ports/mem_ops.h" -void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, - unsigned int fourcc, int frame_cnt) { +void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc, + int frame_cnt, int frame_width, + int frame_height, + vpx_rational_t timebase) { char header[32]; header[0] = 'D'; header[1] = 'K'; header[2] = 'I'; header[3] = 'F'; - mem_put_le16(header + 4, 0); // version - mem_put_le16(header + 6, 32); // header size - mem_put_le32(header + 8, fourcc); // fourcc - mem_put_le16(header + 12, cfg->g_w); // width - mem_put_le16(header + 14, cfg->g_h); // height - mem_put_le32(header + 16, cfg->g_timebase.den); // rate - mem_put_le32(header + 20, cfg->g_timebase.num); // scale - mem_put_le32(header + 24, frame_cnt); // length - mem_put_le32(header + 28, 0); // unused + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, fourcc); // fourcc + mem_put_le16(header + 12, frame_width); // width + mem_put_le16(header + 14, frame_height); // height + mem_put_le32(header + 16, timebase.den); // rate + mem_put_le32(header + 20, timebase.num); // scale + mem_put_le32(header + 24, frame_cnt); // length + mem_put_le32(header + 28, 0); // unused fwrite(header, 1, 32, outfile); } +void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { + ivf_write_file_header_with_video_info(outfile, fourcc, frame_cnt, cfg->g_w, + cfg->g_h, cfg->g_timebase); +} + void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { char header[12]; diff --git a/media/libvpx/libvpx/ivfenc.h b/media/libvpx/libvpx/ivfenc.h index ebdce47be8..27b6910805 100644 --- a/media/libvpx/libvpx/ivfenc.h +++ b/media/libvpx/libvpx/ivfenc.h @@ -7,11 +7,13 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFENC_H_ -#define IVFENC_H_ +#ifndef VPX_IVFENC_H_ +#define VPX_IVFENC_H_ #include "./tools_common.h" +#include "vpx/vpx_encoder.h" + struct vpx_codec_enc_cfg; struct vpx_codec_cx_pkt; @@ -19,6 +21,11 @@ struct vpx_codec_cx_pkt; extern "C" { #endif +void ivf_write_file_header_with_video_info(FILE *outfile, unsigned int fourcc, + int frame_cnt, int frame_width, + int frame_height, + vpx_rational_t timebase); + void ivf_write_file_header(FILE *outfile, const struct vpx_codec_enc_cfg *cfg, uint32_t fourcc, int frame_cnt); @@ -30,4 +37,4 @@ void ivf_write_frame_size(FILE *outfile, size_t frame_size); } /* extern "C" */ #endif -#endif // IVFENC_H_ +#endif // VPX_IVFENC_H_ diff --git a/media/libvpx/libvpx/libs.doxy_template b/media/libvpx/libvpx/libs.doxy_template index 5a8f847280..6d05162d00 100644 --- a/media/libvpx/libvpx/libs.doxy_template +++ b/media/libvpx/libvpx/libs.doxy_template @@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES ALPHABETICAL_INDEX = NO -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that @@ -943,18 +937,6 @@ GENERATE_XML = NO XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that @@ -1111,32 +1093,10 @@ ALLEXTERNALS = NO EXTERNAL_GROUPS = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and la_te_x) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to -# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to -# specify the directory where the mscgen tool resides. If left empty the tool is assumed to -# be found in the default search path. - -MSCGEN_PATH = - # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. @@ -1150,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES HAVE_DOT = NO -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a +# graph for each documented class showing the direct and indirect inheritance +# relations. In case HAVE_DOT is set as well dot will be used to draw the graph, +# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set +# to TEXT the direct and indirect inheritance relations will be shown as texts / +# links. +# Possible values are: NO, YES, TEXT and GRAPH. +# The default value is: YES. CLASS_GRAPH = YES @@ -1259,14 +1223,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, which results in a white background. -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). - -DOT_TRANSPARENT = YES - # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk index 36935bd1e6..483da0fdd2 100644 --- a/media/libvpx/libvpx/libs.mk +++ b/media/libvpx/libvpx/libs.mk @@ -11,7 +11,7 @@ # ARM assembly files are written in RVCT-style. We use some make magic to # filter those files to allow GCC compilation -ifeq ($(ARCH_ARM),yes) +ifeq ($(VPX_ARCH_ARM),yes) ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm) else ASM:=.asm @@ -63,6 +63,7 @@ ifeq ($(CONFIG_VP8_ENCODER),yes) CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS)) INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/% CODEC_DOC_SECTIONS += vp8 vp8_encoder endif @@ -87,13 +88,35 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h + CODEC_SRCS-yes += vpx/vpx_ext_ratectrl.h INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h - INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h + INSTALL-LIBS-yes += include/vpx/vpx_ext_ratectrl.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% - CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h + CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h CODEC_DOC_SECTIONS += vp9 vp9_encoder endif +RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h +RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h +RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h +ifeq ($(CONFIG_VP9_ENCODER),yes) + VP9_PREFIX=vp9/ + RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) + RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk + RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc + RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h +endif +ifeq ($(CONFIG_VP8_ENCODER),yes) + VP8_PREFIX=vp8/ + RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS)) + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h +endif + ifeq ($(CONFIG_VP9_DECODER),yes) VP9_PREFIX=vp9/ include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx.mk @@ -113,16 +136,10 @@ ifeq ($(CONFIG_DECODERS),yes) CODEC_DOC_SECTIONS += decoder endif -# Suppress -Wextra warnings in third party code. -$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers -# Suppress -Wextra warnings in first party code pending investigation. -# https://bugs.chromium.org/p/webm/issues/detail?id=1069 -$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered -$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered - ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) +RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd) # This variable uses deferred expansion intentionally, since the results of # $(wildcard) may change during the course of the Make. VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d)))) @@ -147,14 +164,12 @@ CODEC_SRCS-yes += vpx_ports/mem_ops_aligned.h CODEC_SRCS-yes += vpx_ports/vpx_once.h CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c -ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm endif CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc -ifeq ($(CONFIG_SPATIAL_SVC),yes) -CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc -endif CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec INSTALL-LIBS-yes += include/vpx/vpx_codec.h @@ -163,19 +178,25 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) +ifeq ($(CONFIG_STATIC),yes) INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB)d.lib) +endif INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.dll) INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp) endif else INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a +ifeq ($(CONFIG_STATIC),yes) INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a endif +endif CODEC_SRCS=$(call enabled,CODEC_SRCS) + INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(CODEC_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) @@ -187,6 +208,18 @@ libvpx_srcs.txt: @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_srcs.txt +libvpxrc_srcs.txt: + @echo " [CREATE] $@" + @echo $(RC_RTC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ +CLEAN-OBJS += libvpxrc_srcs.txt + +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -198,12 +231,7 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def -# Assembly files that are included, but don't define symbols themselves. -# Filtered out to avoid Visual Studio build warnings. -ASM_INCLUDES := \ - third_party/x86inc/x86inc.asm \ - vpx_config.asm \ - vpx_ports/x86_abi_support.asm \ +vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" @@ -217,7 +245,16 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ - $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \ + --as=$(AS) \ + $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ --src-path-bare="$(SRC_PATH_BARE)" \ PROJECTS-yes += vpx.$(VCPROJ_SFX) @@ -225,15 +262,58 @@ PROJECTS-yes += vpx.$(VCPROJ_SFX) vpx.$(VCPROJ_SFX): vpx_config.asm vpx.$(VCPROJ_SFX): $(RTCD) -endif -else -LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) +vpxrc.$(VCPROJ_SFX): \ + VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) + +vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + $(if $(CONFIG_SHARED),--dll,--lib) \ + --target=$(TOOLCHAIN) \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --name=vpxrc \ + --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \ + --ver=$(CONFIG_VS_VERSION) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + --out=$@ $(CFLAGS) \ + --as=$(AS) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.cc, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.cc vp9/%.h vpx/% \ + vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + +PROJECTS-yes += vpxrc.$(VCPROJ_SFX) + +vpxrc.$(VCPROJ_SFX): vpx_config.asm +vpxrc.$(VCPROJ_SFX): $(RTCD) + +endif # ifeq ($(CONFIG_MSVS),yes) +else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 -SO_VERSION_MINOR := 1 +# Updating version info. +# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +# For libtool: c=, a=, r= +# libtool generates .so file as .so.[c-a].a.r, while -version-info c:r:a is +# passed to libtool. +# +# libvpx library file is generated as libvpx.so... +# MAJOR = c-a, MINOR = a, PATCH = r +# +# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current +# SO_VERSION_* then follow the rules in the link to detemine the new version +# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 +SO_VERSION_MAJOR := 12 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -273,18 +353,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) -libvpx.ver: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)echo "{ global:" > $@ - $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done - $(qexec)echo "local: *; };" >> $@ -CLEAN-OBJS += libvpx.ver - -libvpx.syms: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)awk '{print "_"$$2}' $^ >$@ -CLEAN-OBJS += libvpx.syms - libvpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ @@ -342,22 +410,42 @@ endif INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc + +ifeq ($(CONFIG_ENCODERS),yes) + RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS)) + OBJS-yes += $(RC_RTC_OBJS) + LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a + $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS) endif +endif # ifeq ($(CONFIG_EXTERNAL_BUILD),yes) + +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + # # Rule to make assembler configuration file from C configuration file # -ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) # YASM $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " equ " $$3}' > $@ else ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION)) $(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h @echo " [CREATE] $@" - @egrep "#define [A-Z0-9_]+ [01]" $< \ + @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \ | awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@ @echo " END" $(ADS2GAS) >> $@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm @@ -387,27 +475,56 @@ ifeq ($(CONFIG_UNIT_TESTS),yes) LIBVPX_TEST_DATA_PATH ?= . include $(SRC_PATH_BARE)/test/test.mk -LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) + +# addprefix_clean behaves like addprefix if the target doesn't start with "../" +# However, if the target starts with "../", instead of adding prefix, +# it will remove "../". +# Using addprefix_clean, we can avoid two different targets building the +# same file, i.e. +# test/../ivfenc.c.d: ivfenc.o +# ivfenc.c.d: ivfenc.o +# Note that the other way to solve this problem is using "realpath". +# The "realpath" is supported by make 3.81 or later. +addprefix_clean=$(patsubst $(1)../%,%,$(addprefix $(1), $(2))) +LIBVPX_TEST_SRCS=$(call addprefix_clean,test/,$(call enabled,LIBVPX_TEST_SRCS)) + LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) -TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) +TEST_INTRA_PRED_SPEED_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS))) +ifeq ($(CONFIG_ENCODERS),yes) +RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX) +RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\ + $(call enabled,RC_INTERFACE_TEST_SRCS)) +RC_INTERFACE_TEST_OBJS := $(sort $(call objs,$(RC_INTERFACE_TEST_SRCS))) +endif + libvpx_test_srcs.txt: @echo " [CREATE] $@" @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_test_srcs.txt +# Attempt to download the file using curl, retrying once if it fails for a +# partial file (18). $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - $(qexec)trap 'rm -f $@' INT TERM &&\ - curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F)) + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl -S -s --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; ret=$$?; \ + case "$$ret" in \ + 18) $$curl -C - ;; \ + *) exit $$ret ;; \ + esac \ + ) -testdata:: $(LIBVPX_TEST_DATA) +testdata: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ [ -x "$$(which shasum)" ] && sha1sum=shasum;\ [ -x "$$(which sha1)" ] && sha1sum=sha1;\ @@ -416,7 +533,7 @@ testdata:: $(LIBVPX_TEST_DATA) echo "Checking test data:";\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ + (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\ done; \ else\ echo "Skipping test data integrity check, sha1sum not found.";\ @@ -435,6 +552,7 @@ gtest.$(VCPROJ_SFX): $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.c --proj-guid=EC00E1EC-AF68-4D92-A255-181690D1C9B1 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ -D_VARIADIC_MAX=10 \ --out=gtest.$(VCPROJ_SFX) $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" -I"$(SRC_PATH_BARE)/third_party/googletest/src" @@ -451,6 +569,7 @@ test_libvpx.$(VCPROJ_SFX): $(LIBVPX_TEST_SRCS) vpx.$(VCPROJ_SFX) gtest.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ @@ -473,12 +592,35 @@ test_intra_pred_speed.$(VCPROJ_SFX): $(TEST_INTRA_PRED_SPEED_SRCS) vpx.$(VCPROJ_ --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \ --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$(AS) \ $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^ endif # TEST_INTRA_PRED_SPEED -endif + +ifeq ($(CONFIG_ENCODERS),yes) +ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) +PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX) +test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \ + vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX) + @echo " [CREATE] $@" + $(qexec)$(GEN_VCPROJ) \ + --exe \ + --target=$(TOOLCHAIN) \ + --name=test_rc_interface \ + -D_VARIADIC_MAX=10 \ + --proj-guid=30458F88-1BC6-4689-B41C-50F3737AAB27 \ + --ver=$(CONFIG_VS_VERSION) \ + --as=$(AS) \ + --src-path-bare="$(SRC_PATH_BARE)" \ + $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \ + -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \ + -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^ +endif # RC_INTERFACE_TEST +endif # CONFIG_ENCODERS +endif # CONFIG_MSVS else include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk @@ -519,31 +661,46 @@ $(eval $(call linkerxx_template,$(TEST_INTRA_PRED_SPEED_BIN), \ -L. -lvpx -lgtest $(extralibs) -lm)) endif # TEST_INTRA_PRED_SPEED -endif # CONFIG_UNIT_TESTS +ifeq ($(CONFIG_ENCODERS),yes) +ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),) +$(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \ + CXXFLAGS += $(GTEST_INCLUDES) +OBJS-yes += $(RC_INTERFACE_TEST_OBJS) +BINS-yes += $(RC_INTERFACE_TEST_BIN) + +$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a +$(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \ + $(RC_INTERFACE_TEST_OBJS) \ + -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm)) +endif # RC_INTERFACE_TEST +endif # CONFIG_ENCODERS + +endif # CONFIG_EXTERNAL_BUILD # Install test sources only if codec source is included INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\ $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f)) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS) INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS) +INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS) define test_shard_template -test:: test_shard.$(1) -test-no-data-check:: test_shard_ndc.$(1) +test: test_shard.$(1) +test-no-data-check: test_shard_ndc.$(1) test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN) @set -e; \ export GTEST_SHARD_INDEX=$(1); \ export GTEST_TOTAL_SHARDS=$(2); \ $(LIBVPX_TEST_BIN) test_shard.$(1): testdata -.PHONY: test_shard.$(1) +.PHONY: test_shard.$(1) test_shard_ndc.$(1) endef NUM_SHARDS := 10 SHARDS := 0 1 2 3 4 5 6 7 8 9 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS)))) -endif +endif # CONFIG_UNIT_TESTS ## ## documentation directives @@ -566,6 +723,7 @@ endif ## Update the global src list SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS) +SRCS += $(RC_INTERFACE_TEST_SRCS) ## ## vpxdec/vpxenc tests. @@ -582,10 +740,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH)) endif utiltest utiltest-no-data-check: $(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) $(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(TEST_BIN_PATH) utiltest: testdata else @@ -609,7 +767,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release endif exampletest exampletest-no-data-check: examples $(qexec)$(SRC_PATH_BARE)/test/examples.sh \ - --test-data-path $(LIBVPX_TEST_DATA_PATH) \ + --test-data-path "$(LIBVPX_TEST_DATA_PATH)" \ --bin-path $(EXAMPLES_BIN_PATH) exampletest: testdata else diff --git a/media/libvpx/libvpx/mainpage.dox b/media/libvpx/libvpx/mainpage.dox index ec202fa4fb..4b0dff0871 100644 --- a/media/libvpx/libvpx/mainpage.dox +++ b/media/libvpx/libvpx/mainpage.dox @@ -25,8 +25,10 @@ release. - The \ref readme contains instructions on recompiling the sample applications. - Read the \ref usage "usage" for a narrative on codec usage. + \if samples - Read the \ref samples "sample code" for examples of how to interact with the codec. + \endif - \ref codec reference \if encoder - \ref encoder reference diff --git a/media/libvpx/libvpx/md5_utils.c b/media/libvpx/libvpx/md5_utils.c index 093798b833..abd8d43c39 100644 --- a/media/libvpx/libvpx/md5_utils.c +++ b/media/libvpx/libvpx/md5_utils.c @@ -23,6 +23,7 @@ #include /* for memcpy() */ #include "md5_utils.h" +#include "vpx_ports/compiler_attributes.h" static void byteSwap(UWORD32 *buf, unsigned words) { md5byte *p; @@ -145,25 +146,14 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { #define MD5STEP(f, w, x, y, z, in, s) \ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) -#if defined(__clang__) && defined(__has_attribute) -#if __has_attribute(no_sanitize) -#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ - __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif -#endif - -#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK -#define VPX_NO_UNSIGNED_OVERFLOW_CHECK -#endif - /* * The core of the MD5 algorithm, this alters an existing MD5 hash to * reflect the addition of 16 longwords of new data. MD5Update blocks * the data and converts bytes into longwords for this routine. */ -VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], - UWORD32 const in[16]) { - register UWORD32 a, b, c, d; +VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform( + UWORD32 buf[4], UWORD32 const in[16]) { + UWORD32 a, b, c, d; a = buf[0]; b = buf[1]; @@ -244,6 +234,4 @@ VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], buf[3] += d; } -#undef VPX_NO_UNSIGNED_OVERFLOW_CHECK - #endif diff --git a/media/libvpx/libvpx/md5_utils.h b/media/libvpx/libvpx/md5_utils.h index bd4991b3ad..e0d5a2d1fb 100644 --- a/media/libvpx/libvpx/md5_utils.h +++ b/media/libvpx/libvpx/md5_utils.h @@ -20,8 +20,8 @@ * Still in the public domain. */ -#ifndef MD5_UTILS_H_ -#define MD5_UTILS_H_ +#ifndef VPX_MD5_UTILS_H_ +#define VPX_MD5_UTILS_H_ #ifdef __cplusplus extern "C" { @@ -46,4 +46,4 @@ void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); } // extern "C" #endif -#endif // MD5_UTILS_H_ +#endif // VPX_MD5_UTILS_H_ diff --git a/media/libvpx/libvpx/rate_hist.c b/media/libvpx/libvpx/rate_hist.c index 872a10bae0..6a056cac10 100644 --- a/media/libvpx/libvpx/rate_hist.c +++ b/media/libvpx/libvpx/rate_hist.c @@ -9,10 +9,11 @@ */ #include -#include #include -#include #include +#include +#include +#include #include "./rate_hist.h" @@ -37,12 +38,19 @@ struct rate_hist { struct rate_hist *init_rate_histogram(const vpx_codec_enc_cfg_t *cfg, const vpx_rational_t *fps) { int i; - struct rate_hist *hist = malloc(sizeof(*hist)); + struct rate_hist *hist = calloc(1, sizeof(*hist)); + + if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || + fps->den == 0) { + destroy_rate_histogram(hist); + return NULL; + } // Determine the number of samples in the buffer. Use the file's framerate // to determine the number of frames in rc_buf_sz milliseconds, with an // adjustment (5/4) to account for alt-refs - hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; + hist->samples = + (int)((int64_t)cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000); // prevent division by zero if (hist->samples == 0) hist->samples = 1; @@ -80,7 +88,11 @@ void update_rate_histogram(struct rate_hist *hist, (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den; - int idx = hist->frames++ % hist->samples; + int idx; + + if (hist == NULL || cfg == NULL || pkt == NULL) return; + + idx = hist->frames++ % hist->samples; hist->pts[idx] = now; hist->sz[idx] = (int)pkt->data.frame.sz; @@ -116,9 +128,14 @@ void update_rate_histogram(struct rate_hist *hist, static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, int *num_buckets) { int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; - int buckets = *num_buckets; + int buckets; int i; + assert(bucket != NULL); + assert(num_buckets != NULL); + + buckets = *num_buckets; + /* Find the extrema for this list of buckets */ big_bucket = small_bucket = 0; for (i = 0; i < buckets; i++) { @@ -178,38 +195,42 @@ static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, static void show_histogram(const struct hist_bucket *bucket, int buckets, int total, int scale) { - const char *pat1, *pat2; + int width1, width2; int i; + if (!buckets) return; + assert(bucket != NULL); + assert(buckets > 0); + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { case 1: case 2: - pat1 = "%4d %2s: "; - pat2 = "%4d-%2d: "; + width1 = 4; + width2 = 2; break; case 3: - pat1 = "%5d %3s: "; - pat2 = "%5d-%3d: "; + width1 = 5; + width2 = 3; break; case 4: - pat1 = "%6d %4s: "; - pat2 = "%6d-%4d: "; + width1 = 6; + width2 = 4; break; case 5: - pat1 = "%7d %5s: "; - pat2 = "%7d-%5d: "; + width1 = 7; + width2 = 5; break; case 6: - pat1 = "%8d %6s: "; - pat2 = "%8d-%6d: "; + width1 = 8; + width2 = 6; break; case 7: - pat1 = "%9d %7s: "; - pat2 = "%9d-%7d: "; + width1 = 9; + width2 = 7; break; default: - pat1 = "%12d %10s: "; - pat2 = "%12d-%10d: "; + width1 = 12; + width2 = 10; break; } @@ -224,9 +245,10 @@ static void show_histogram(const struct hist_bucket *bucket, int buckets, assert(len <= HIST_BAR_MAX); if (bucket[i].low == bucket[i].high) - fprintf(stderr, pat1, bucket[i].low, ""); + fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, ""); else - fprintf(stderr, pat2, bucket[i].low, bucket[i].high); + fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2, + bucket[i].high); for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " "); fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); @@ -259,6 +281,8 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, int i, scale; int buckets = 0; + if (hist == NULL || cfg == NULL) return; + for (i = 0; i < RATE_BINS; i++) { if (hist->bucket[i].low == INT_MAX) continue; hist->bucket[buckets++] = hist->bucket[i]; diff --git a/media/libvpx/libvpx/rate_hist.h b/media/libvpx/libvpx/rate_hist.h index 00a1676a61..d6a4c68519 100644 --- a/media/libvpx/libvpx/rate_hist.h +++ b/media/libvpx/libvpx/rate_hist.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef RATE_HIST_H_ -#define RATE_HIST_H_ +#ifndef VPX_RATE_HIST_H_ +#define VPX_RATE_HIST_H_ #include "vpx/vpx_encoder.h" @@ -37,4 +37,4 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, } // extern "C" #endif -#endif // RATE_HIST_H_ +#endif // VPX_RATE_HIST_H_ diff --git a/media/libvpx/libvpx/test/acm_random.h b/media/libvpx/libvpx/test/acm_random.h index c2f6b0e410..6ebb60028e 100644 --- a/media/libvpx/libvpx/test/acm_random.h +++ b/media/libvpx/libvpx/test/acm_random.h @@ -8,10 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_ACM_RANDOM_H_ -#define TEST_ACM_RANDOM_H_ +#ifndef VPX_TEST_ACM_RANDOM_H_ +#define VPX_TEST_ACM_RANDOM_H_ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include + +#include "gtest/gtest.h" #include "vpx/vpx_integer.h" @@ -24,37 +28,56 @@ class ACMRandom { explicit ACMRandom(int seed) : random_(seed) {} void Reset(int seed) { random_.Reseed(seed); } - uint16_t Rand16(void) { + uint16_t Rand16() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); return (value >> 15) & 0xffff; } - int16_t Rand9Signed(void) { - // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). - const uint32_t value = random_.Generate(512); - return static_cast(value) - 256; + int32_t Rand20Signed() { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast(value) - 524288; } - uint8_t Rand8(void) { + int16_t Rand16Signed() { + // Use 16 bits: values between 32767 and -32768. + return static_cast(random_.Generate(65536)); + } + + uint16_t Rand12() { + const uint32_t value = + random_.Generate(testing::internal::Random::kMaxRange); + // There's a bit more entropy in the upper bits of this implementation. + return (value >> 19) & 0xfff; + } + + uint8_t Rand8() { const uint32_t value = random_.Generate(testing::internal::Random::kMaxRange); // There's a bit more entropy in the upper bits of this implementation. return (value >> 23) & 0xff; } - uint8_t Rand8Extremes(void) { + uint8_t Rand8Extremes() { // Returns a random value near 0 or near 255, to better exercise // saturation behavior. const uint8_t r = Rand8(); - return r < 128 ? r << 4 : r >> 4; + return static_cast((r < 128) ? r << 4 : r >> 4); + } + + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); } int PseudoUniform(int range) { return random_.Generate(range); } int operator()(int n) { return PseudoUniform(n); } - static int DeterministicSeed(void) { return 0xbaba; } + static int DeterministicSeed() { return 0xbaba; } private: testing::internal::Random random_; @@ -62,4 +85,4 @@ class ACMRandom { } // namespace libvpx_test -#endif // TEST_ACM_RANDOM_H_ +#endif // VPX_TEST_ACM_RANDOM_H_ diff --git a/media/libvpx/libvpx/test/active_map_refresh_test.cc b/media/libvpx/libvpx/test/active_map_refresh_test.cc index d893635505..a0b46059a7 100644 --- a/media/libvpx/libvpx/test/active_map_refresh_test.cc +++ b/media/libvpx/libvpx/test/active_map_refresh_test.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" @@ -62,25 +62,25 @@ class ActiveMapRefreshTest public ::libvpx_test::CodecTestWith2Params { protected: ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ActiveMapRefreshTest() {} + ~ActiveMapRefreshTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { ::libvpx_test::Y4mVideoSource *y4m_video = static_cast(video); - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh); } else if (video->frame() >= 2 && video->img()) { vpx_image_t *current = video->img(); vpx_image_t *previous = y4m_holder_->img(); - ASSERT_TRUE(previous != NULL); + ASSERT_NE(previous, nullptr); vpx_active_map_t map = vpx_active_map_t(); const int width = static_cast(current->d_w); const int height = static_cast(current->d_h); @@ -122,7 +122,7 @@ TEST_P(ActiveMapRefreshTest, Test) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 6)); +VP9_INSTANTIATE_TEST_SUITE(ActiveMapRefreshTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 6)); } // namespace diff --git a/media/libvpx/libvpx/test/active_map_test.cc b/media/libvpx/libvpx/test/active_map_test.cc index 1d24f956f5..e8976b416a 100644 --- a/media/libvpx/libvpx/test/active_map_test.cc +++ b/media/libvpx/libvpx/test/active_map_test.cc @@ -9,7 +9,7 @@ */ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -19,24 +19,26 @@ namespace { class ActiveMapTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { + public ::libvpx_test::CodecTestWith3Params { protected: static const int kWidth = 208; static const int kHeight = 144; ActiveMapTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ActiveMapTest() {} + ~ActiveMapTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3)); } else if (video->frame() == 3) { vpx_active_map_t map = vpx_active_map_t(); /* clang-format off */ @@ -62,7 +64,7 @@ class ActiveMapTest vpx_active_map_t map = vpx_active_map_t(); map.cols = (kWidth + 15) / 16; map.rows = (kHeight + 15) / 16; - map.active_map = NULL; + map.active_map = nullptr; encoder->Control(VP8E_SET_ACTIVEMAP, &map); } } @@ -85,7 +87,7 @@ TEST_P(ActiveMapTest, Test) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(ActiveMapTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 10), ::testing::Values(0, 3)); } // namespace diff --git a/media/libvpx/libvpx/test/add_noise_test.cc b/media/libvpx/libvpx/test/add_noise_test.cc index eae32c33bb..89c821fe20 100644 --- a/media/libvpx/libvpx/test/add_noise_test.cc +++ b/media/libvpx/libvpx/test/add_noise_test.cc @@ -8,11 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include + +#include "gtest/gtest.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/util.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_config.h" #include "vpx_dsp/postproc.h" #include "vpx_mem/vpx_mem.h" @@ -20,15 +24,17 @@ namespace { static const int kNoiseSize = 3072; -// TODO(jimbankoski): make width and height integers not unsigned. -typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise, - int blackclamp, int whiteclamp, int width, - int height, int pitch); +using AddNoiseFunc = void (*)(uint8_t *start, const int8_t *noise, + int blackclamp, int whiteclamp, int width, + int height, int pitch); -class AddNoiseTest : public ::testing::TestWithParam { +using AddNoiseTestFPParam = std::tuple; + +class AddNoiseTest : public ::testing::Test, + public ::testing::WithParamInterface { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } - virtual ~AddNoiseTest() {} + void TearDown() override { libvpx_test::ClearSystemState(); } + ~AddNoiseTest() override = default; }; double stddev6(char a, char b, char c, char d, char e, char f) { @@ -44,14 +50,14 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { const int height = 64; const int image_size = width * height; int8_t noise[kNoiseSize]; - const int clamp = vpx_setup_noise(4.4, noise, kNoiseSize); + const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize); uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, sizeof(*s))); - ASSERT_TRUE(s != NULL); + ASSERT_NE(s, nullptr); memset(s, 99, image_size * sizeof(*s)); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure we don't end up having either the same or no added // noise either vertically or horizontally. @@ -70,7 +76,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 255, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll over. for (int i = 0; i < image_size; ++i) { @@ -81,7 +87,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 0, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll under. for (int i = 0; i < image_size; ++i) { @@ -100,15 +106,15 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, 1)); uint8_t *const d = reinterpret_cast(vpx_calloc(image_size, 1)); - ASSERT_TRUE(s != NULL); - ASSERT_TRUE(d != NULL); + ASSERT_NE(s, nullptr); + ASSERT_NE(d, nullptr); memset(s, 99, image_size); memset(d, 99, image_size); srand(0); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); srand(0); ASM_REGISTER_STATE_CHECK( vpx_plane_add_noise_c(d, noise, clamp, clamp, width, height, width)); @@ -121,16 +127,24 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { vpx_free(s); } -INSTANTIATE_TEST_CASE_P(C, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_c)); +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c), + make_tuple(4.4, vpx_plane_add_noise_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_sse2)); +INSTANTIATE_TEST_SUITE_P( + SSE2, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2), + make_tuple(4.4, vpx_plane_add_noise_sse2))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_msa)); +INSTANTIATE_TEST_SUITE_P( + MSA, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa), + make_tuple(4.4, vpx_plane_add_noise_msa))); #endif } // namespace diff --git a/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc index 64a3011eb9..ade82d7ed2 100644 --- a/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc +++ b/media/libvpx/libvpx/test/alt_ref_aq_segment_test.cc @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -20,9 +20,9 @@ class AltRefAqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~AltRefAqSegmentTest() {} + ~AltRefAqSegmentTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -30,9 +30,9 @@ class AltRefAqSegmentTest alt_ref_aq_mode_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -150,8 +150,8 @@ TEST_P(AltRefAqSegmentTest, TestNoMisMatchAltRefAQ4) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(AltRefAqSegmentTest, - ::testing::Values(::libvpx_test::kOnePassGood, - ::libvpx_test::kTwoPassGood), - ::testing::Range(2, 5)); +VP9_INSTANTIATE_TEST_SUITE(AltRefAqSegmentTest, + ::testing::Values(::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood), + ::testing::Range(2, 5)); } // namespace diff --git a/media/libvpx/libvpx/test/altref_test.cc b/media/libvpx/libvpx/test/altref_test.cc index f9308c2717..e98edcba4a 100644 --- a/media/libvpx/libvpx/test/altref_test.cc +++ b/media/libvpx/libvpx/test/altref_test.cc @@ -7,11 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "vpx_config.h" namespace { #if CONFIG_VP8_ENCODER @@ -24,24 +25,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { protected: AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {} - virtual ~AltRefTest() {} + ~AltRefTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; } + void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_CPUUSED, 3); } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_; } @@ -63,8 +64,8 @@ TEST_P(AltRefTest, MonotonicTimestamps) { EXPECT_GE(altref_count(), 1); } -VP8_INSTANTIATE_TEST_CASE(AltRefTest, - ::testing::Range(kLookAheadMin, kLookAheadMax)); +VP8_INSTANTIATE_TEST_SUITE(AltRefTest, + ::testing::Range(kLookAheadMin, kLookAheadMax)); #endif // CONFIG_VP8_ENCODER @@ -75,17 +76,17 @@ class AltRefForcedKeyTestLarge AltRefForcedKeyTestLarge() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {} - virtual ~AltRefForcedKeyTestLarge() {} + ~AltRefForcedKeyTestLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); cfg_.rc_end_usage = VPX_VBR; cfg_.g_threads = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); @@ -100,7 +101,7 @@ class AltRefForcedKeyTestLarge (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame_num_ == forced_kf_frame_num_) { ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY)) << "Frame #" << frame_num_ << " isn't a keyframe!"; @@ -142,11 +143,11 @@ TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) { } } -VP8_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge, - ::testing::Values(::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP8_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); -VP9_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge, - ::testing::Values(::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(AltRefForcedKeyTestLarge, + ::testing::Values(::libvpx_test::kOnePassGood), + ::testing::Range(0, 9)); } // namespace diff --git a/media/libvpx/libvpx/test/android/Android.mk b/media/libvpx/libvpx/test/android/Android.mk index 48872a2b65..9a7533ebba 100644 --- a/media/libvpx/libvpx/test/android/Android.mk +++ b/media/libvpx/libvpx/test/android/Android.mk @@ -10,6 +10,9 @@ # The test app itself runs on the command line through adb shell # The paths are really messed up as the libvpx make file # expects to be made from a parent directory. + +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT CUR_WD := $(call my-dir) BINDINGS_DIR := $(CUR_WD)/../../.. LOCAL_PATH := $(CUR_WD)/../../.. @@ -32,7 +35,11 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_MODULE := gtest LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS include $(BUILD_STATIC_LIBRARY) #libvpx_test @@ -47,6 +54,9 @@ else LOCAL_STATIC_LIBRARIES += vpx endif +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/../../LICENSE $(LOCAL_PATH)/../../PATENTS include $(LOCAL_PATH)/test/test.mk LOCAL_C_INCLUDES := $(BINDINGS_DIR) FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes))) @@ -54,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC)) # some test files depend on *_rtcd.h, ensure they're generated first. $(eval $(call rtcd_dep_template)) include $(BUILD_EXECUTABLE) +endif # NDK_ROOT diff --git a/media/libvpx/libvpx/test/android/README b/media/libvpx/libvpx/test/android/README index 4a1adcf7f4..0cd30779d4 100644 --- a/media/libvpx/libvpx/test/android/README +++ b/media/libvpx/libvpx/test/android/README @@ -3,19 +3,20 @@ Android.mk will build vpx unittests on android. ./libvpx/configure --target=armv7-android-gcc --enable-external-build \ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \ - --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK + --disable-examples --disable-runtime-cpu-detect 2) From the parent directory, invoke ndk-build: NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \ APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \ - APP_STL=gnustl_static + APP_STL=c++_static -Note: Both adb and ndk-build are available prebuilt at: - https://chromium.googlesource.com/android_tools +Note: Both adb and ndk-build are available at: + https://developer.android.com/studio#downloads + https://developer.android.com/ndk/downloads 3) Run get_files.py to download the test files: python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \ - -u http://downloads.webmproject.org/test_data/libvpx + -u https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx 4) Transfer files to device using adb. Ensure you have proper permissions for the target diff --git a/media/libvpx/libvpx/test/android/get_files.py b/media/libvpx/libvpx/test/android/get_files.py index 1c69740d2b..98ce7b1947 100644 --- a/media/libvpx/libvpx/test/android/get_files.py +++ b/media/libvpx/libvpx/test/android/get_files.py @@ -38,7 +38,7 @@ def get_file_sha(filename): buf = file.read(HASH_CHUNK) return sha_hash.hexdigest() except IOError: - print "Error reading " + filename + print("Error reading " + filename) # Downloads a file from a url, and then checks the sha against the passed # in sha @@ -67,7 +67,7 @@ try: getopt.getopt(sys.argv[1:], \ "u:i:o:", ["url=", "input_csv=", "output_dir="]) except: - print 'get_files.py -u -i -o ' + print('get_files.py -u -i -o ') sys.exit(2) for opt, arg in opts: @@ -79,7 +79,7 @@ for opt, arg in opts: local_resource_path = os.path.join(arg) if len(sys.argv) != 7: - print "Expects two paths and a url!" + print("Expects two paths and a url!") exit(1) if not os.path.isdir(local_resource_path): @@ -89,7 +89,7 @@ file_list_csv = open(file_list_path, "rb") # Our 'csv' file uses multiple spaces as a delimiter, python's # csv class only uses single character delimiters, so we convert them below -file_list_reader = csv.reader((re.sub(' +', ' ', line) \ +file_list_reader = csv.reader((re.sub(' +', ' ', line.decode('utf-8')) \ for line in file_list_csv), delimiter = ' ') file_shas = [] @@ -104,15 +104,16 @@ for row in file_list_reader: file_list_csv.close() # Download files, only if they don't already exist and have correct shas -for filename, sha in itertools.izip(file_names, file_shas): +for filename, sha in zip(file_names, file_shas): + filename = filename.lstrip('*') path = os.path.join(local_resource_path, filename) if os.path.isfile(path) \ and get_file_sha(path) == sha: - print path + ' exists, skipping' + print(path + ' exists, skipping') continue for retry in range(0, ftp_retries): - print "Downloading " + path + print("Downloading " + path) if not download_and_check_sha(url, filename, sha): - print "Sha does not match, retrying..." + print("Sha does not match, retrying...") else: break diff --git a/media/libvpx/libvpx/test/aq_segment_test.cc b/media/libvpx/libvpx/test/aq_segment_test.cc index 1c2147fbb2..f7e9a118fc 100644 --- a/media/libvpx/libvpx/test/aq_segment_test.cc +++ b/media/libvpx/libvpx/test/aq_segment_test.cc @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -20,18 +20,18 @@ class AqSegmentTest public ::libvpx_test::CodecTestWith2Params { protected: AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~AqSegmentTest() {} + ~AqSegmentTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); aq_mode_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100); @@ -102,8 +102,8 @@ TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(AqSegmentTest, - ::testing::Values(::libvpx_test::kRealTime, - ::libvpx_test::kOnePassGood), - ::testing::Range(3, 9)); +VP9_INSTANTIATE_TEST_SUITE(AqSegmentTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood), + ::testing::Range(3, 9)); } // namespace diff --git a/media/libvpx/libvpx/test/avg_test.cc b/media/libvpx/libvpx/test/avg_test.cc index 272b99695e..42614c9bd3 100644 --- a/media/libvpx/libvpx/test/avg_test.cc +++ b/media/libvpx/libvpx/test/avg_test.cc @@ -11,9 +11,11 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -21,39 +23,43 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + +template class AverageTestBase : public ::testing::Test { public: - AverageTestBase(int width, int height) : width_(width), height_(height) {} + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(nullptr), + source_stride_(0), bit_depth_(8) {} - static void SetUpTestCase() { - source_data_ = reinterpret_cast( - vpx_memalign(kDataAlignment, kDataBlockSize)); - } - - static void TearDownTestCase() { + void TearDown() override { vpx_free(source_data_); - source_data_ = NULL; + source_data_ = nullptr; + libvpx_test::ClearSystemState(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 static const int kDataAlignment = 16; static const int kDataBlockSize = 64 * 128; - virtual void SetUp() { + void SetUp() override { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_NE(source_data_, nullptr); source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; rnd_.Reset(ACMRandom::DeterministicSeed()); } // Sum Pixels - static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -61,7 +67,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -69,7 +75,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 8) >> 4); } - void FillConstant(uint8_t fill_constant) { + void FillConstant(Pixel fill_constant) { for (int i = 0; i < width_ * height_; ++i) { source_data_[i] = fill_constant; } @@ -77,21 +83,22 @@ class AverageTestBase : public ::testing::Test { void FillRandom() { for (int i = 0; i < width_ * height_; ++i) { - source_data_[i] = rnd_.Rand8(); + source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1); } } int width_, height_; - static uint8_t *source_data_; + Pixel *source_data_; int source_stride_; + int bit_depth_; ACMRandom rnd_; }; -typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); +using AverageFunction = unsigned int (*)(const uint8_t *s, int pitch); -typedef std::tr1::tuple AvgFunc; +using AvgFunc = std::tuple; -class AverageTest : public AverageTestBase, +class AverageTest : public AverageTestBase, public ::testing::WithParamInterface { public: AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -117,38 +124,75 @@ class AverageTest : public AverageTestBase, } }; -typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height); +#if CONFIG_VP9_HIGHBITDEPTH +class AverageTestHBD : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} -typedef std::tr1::tuple IntProRowParam; + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } -class IntProRowTest : public AverageTestBase, + ASM_REGISTER_STATE_CHECK(GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_)); + unsigned int actual = GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA +using IntProRowFunc = void (*)(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height); + +using IntProRowParam = std::tuple; + +class IntProRowTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProRowTest() - : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) { + : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(nullptr), + hbuf_c_(nullptr) { asm_func_ = GET_PARAM(1); c_func_ = GET_PARAM(2); } protected: - virtual void SetUp() { + void SetUp() override { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_NE(source_data_, nullptr); + hbuf_asm_ = reinterpret_cast( vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); hbuf_c_ = reinterpret_cast( vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16)); } - virtual void TearDown() { + void TearDown() override { + vpx_free(source_data_); + source_data_ = nullptr; vpx_free(hbuf_c_); - hbuf_c_ = NULL; + hbuf_c_ = nullptr; vpx_free(hbuf_asm_); - hbuf_asm_ = NULL; + hbuf_asm_ = nullptr; } void RunComparison() { - ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_)); - ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_)); + ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_)); + ASM_REGISTER_STATE_CHECK( + asm_func_(hbuf_asm_, source_data_, width_, height_)); EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16)) << "Output mismatch"; } @@ -159,12 +203,13 @@ class IntProRowTest : public AverageTestBase, int16_t *hbuf_asm_; int16_t *hbuf_c_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProRowTest); -typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); +using IntProColFunc = int16_t (*)(uint8_t const *ref, const int width); -typedef std::tr1::tuple IntProColParam; +using IntProColParam = std::tuple; -class IntProColTest : public AverageTestBase, +class IntProColTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) { @@ -185,34 +230,34 @@ class IntProColTest : public AverageTestBase, int16_t sum_asm_; int16_t sum_c_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(IntProColTest); +#endif // HAVE_NEON || HAVE_SSE2 || HAVE_MSA -typedef int (*SatdFunc)(const int16_t *coeffs, int length); -typedef std::tr1::tuple SatdTestParam; +using SatdFunc = int (*)(const tran_low_t *coeffs, int length); +using SatdTestParam = std::tuple; class SatdTest : public ::testing::Test, public ::testing::WithParamInterface { protected: - virtual void SetUp() { + void SetUp() override { satd_size_ = GET_PARAM(0); satd_func_ = GET_PARAM(1); rnd_.Reset(ACMRandom::DeterministicSeed()); - src_ = reinterpret_cast( + src_ = reinterpret_cast( vpx_memalign(16, sizeof(*src_) * satd_size_)); - ASSERT_TRUE(src_ != NULL); + ASSERT_NE(src_, nullptr); } - virtual void TearDown() { + void TearDown() override { libvpx_test::ClearSystemState(); vpx_free(src_); } - void FillConstant(const int16_t val) { + void FillConstant(const tran_low_t val) { for (int i = 0; i < satd_size_; ++i) src_[i] = val; } - void FillRandom() { - for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16(); - } + virtual void FillRandom() = 0; void Check(const int expected) { int total; @@ -220,15 +265,84 @@ class SatdTest : public ::testing::Test, EXPECT_EQ(expected, total); } + tran_low_t *GetCoeff() const { return src_; } + int satd_size_; + ACMRandom rnd_; + tran_low_t *src_; private: - int16_t *src_; SatdFunc satd_func_; - ACMRandom rnd_; }; -uint8_t *AverageTestBase::source_data_ = NULL; +class SatdLowbdTest : public SatdTest { + protected: + void FillRandom() override { + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16Signed(); + src_[i] = (tran_low_t)tmp; + } + } +}; + +using BlockErrorFunc = int64_t (*)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size); +using BlockErrorTestFPParam = std::tuple; + +class BlockErrorTestFP + : public ::testing::Test, + public ::testing::WithParamInterface { + protected: + void SetUp() override { + txfm_size_ = GET_PARAM(0); + block_error_func_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + coeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*coeff_) * txfm_size_)); + dqcoeff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*dqcoeff_) * txfm_size_)); + ASSERT_NE(coeff_, nullptr); + ASSERT_NE(dqcoeff_, nullptr); + } + + void TearDown() override { + libvpx_test::ClearSystemState(); + vpx_free(coeff_); + vpx_free(dqcoeff_); + } + + void FillConstant(const tran_low_t coeff_val, const tran_low_t dqcoeff_val) { + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = coeff_val; + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = dqcoeff_val; + } + + void FillRandom() { + // Just two fixed seeds + rnd_.Reset(0xb0b9); + for (int i = 0; i < txfm_size_; ++i) coeff_[i] = rnd_.Rand16() >> 1; + rnd_.Reset(0xb0c8); + for (int i = 0; i < txfm_size_; ++i) dqcoeff_[i] = rnd_.Rand16() >> 1; + } + + void Check(const int64_t expected) { + int64_t total; + ASM_REGISTER_STATE_CHECK( + total = block_error_func_(coeff_, dqcoeff_, txfm_size_)); + EXPECT_EQ(expected, total); + } + + tran_low_t *GetCoeff() const { return coeff_; } + + tran_low_t *GetDQCoeff() const { return dqcoeff_; } + + int txfm_size_; + + private: + tran_low_t *coeff_; + tran_low_t *dqcoeff_; + BlockErrorFunc block_error_func_; + ACMRandom rnd_; +}; TEST_P(AverageTest, MinValue) { FillConstant(0); @@ -248,7 +362,29 @@ TEST_P(AverageTest, Random) { CheckAverages(); } } +#if CONFIG_VP9_HIGHBITDEPTH +TEST_P(AverageTestHBD, MinValue) { + FillConstant(0); + CheckAverages(); +} +TEST_P(AverageTestHBD, MaxValue) { + FillConstant((1 << VPX_BITS_12) - 1); + CheckAverages(); +} + +TEST_P(AverageTestHBD, Random) { + bit_depth_ = VPX_BITS_12; + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON || HAVE_SSE2 || HAVE_MSA TEST_P(IntProRowTest, MinValue) { FillConstant(0); RunComparison(); @@ -278,28 +414,29 @@ TEST_P(IntProColTest, Random) { FillRandom(); RunComparison(); } +#endif -TEST_P(SatdTest, MinValue) { +TEST_P(SatdLowbdTest, MinValue) { const int kMin = -32640; const int expected = -kMin * satd_size_; FillConstant(kMin); Check(expected); } -TEST_P(SatdTest, MaxValue) { +TEST_P(SatdLowbdTest, MaxValue) { const int kMax = 32640; const int expected = kMax * satd_size_; FillConstant(kMax); Check(expected); } -TEST_P(SatdTest, Random) { +TEST_P(SatdLowbdTest, Random) { int expected; switch (satd_size_) { - case 16: expected = 205298; break; - case 64: expected = 1113950; break; - case 256: expected = 4268415; break; - case 1024: expected = 16954082; break; + case 16: expected = 261036; break; + case 64: expected = 991732; break; + case 256: expected = 4136358; break; + case 1024: expected = 16677592; break; default: FAIL() << "Invalid satd size (" << satd_size_ << ") valid: 16/64/256/1024"; @@ -308,21 +445,173 @@ TEST_P(SatdTest, Random) { Check(expected); } -using std::tr1::make_tuple; +TEST_P(SatdLowbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); -INSTANTIATE_TEST_CASE_P( + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +#if CONFIG_VP9_HIGHBITDEPTH +class SatdHighbdTest : public SatdTest { + protected: + void FillRandom() override { + for (int i = 0; i < satd_size_; ++i) { + src_[i] = rnd_.Rand20Signed(); + } + } +}; + +TEST_P(SatdHighbdTest, MinValue) { + const int kMin = -524280; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdHighbdTest, MaxValue) { + const int kMax = 524280; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdHighbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: expected = 5249712; break; + case 64: expected = 18362120; break; + case 256: expected = 66100520; break; + case 1024: expected = 266094734; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdHighbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +TEST_P(BlockErrorTestFP, MinValue) { + const int64_t kMin = -32640; + const int64_t expected = kMin * kMin * txfm_size_; + FillConstant(kMin, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, MaxValue) { + const int64_t kMax = 32640; + const int64_t expected = kMax * kMax * txfm_size_; + FillConstant(kMax, 0); + Check(expected); +} + +TEST_P(BlockErrorTestFP, Random) { + int64_t expected; + switch (txfm_size_) { + case 16: expected = 2051681432; break; + case 64: expected = 11075114379; break; + case 256: expected = 44386271116; break; + case 1024: expected = 184774996089; break; + default: + FAIL() << "Invalid satd size (" << txfm_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + tran_low_t *dqcoeff = GetDQCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( C, AverageTest, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); -INSTANTIATE_TEST_CASE_P(C, SatdTest, - ::testing::Values(make_tuple(16, &vpx_satd_c), - make_tuple(64, &vpx_satd_c), - make_tuple(256, &vpx_satd_c), - make_tuple(1024, &vpx_satd_c))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + SSE2, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon))); +#endif // HAVE_NEON + +INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P(C, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); + +INSTANTIATE_TEST_SUITE_P( + C, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_c), + make_tuple(64, &vp9_block_error_fp_c), + make_tuple(256, &vp9_block_error_fp_c), + make_tuple(1024, &vp9_block_error_fp_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( SSE2, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2), @@ -331,29 +620,60 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, IntProRowTest, ::testing::Values(make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c), make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, IntProColTest, ::testing::Values(make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c), make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, - ::testing::Values(make_tuple(16, &vpx_satd_sse2), - make_tuple(64, &vpx_satd_sse2), - make_tuple(256, &vpx_satd_sse2), - make_tuple(1024, &vpx_satd_sse2))); +INSTANTIATE_TEST_SUITE_P(SSE2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_sse2), + make_tuple(64, &vpx_satd_sse2), + make_tuple(256, &vpx_satd_sse2), + make_tuple(1024, &vpx_satd_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2), + make_tuple(64, &vp9_block_error_fp_sse2), + make_tuple(256, &vp9_block_error_fp_sse2), + make_tuple(1024, &vp9_block_error_fp_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), + make_tuple(64, &vpx_highbd_satd_avx2), + make_tuple(256, &vpx_highbd_satd_avx2), + make_tuple(1024, &vpx_highbd_satd_avx2))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + AVX2, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon), @@ -362,29 +682,54 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, IntProRowTest, ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c), make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, IntProColTest, ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c), make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(NEON, SatdTest, - ::testing::Values(make_tuple(16, &vpx_satd_neon), - make_tuple(64, &vpx_satd_neon), - make_tuple(256, &vpx_satd_neon), - make_tuple(1024, &vpx_satd_neon))); -#endif +INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_neon), + make_tuple(64, &vpx_satd_neon), + make_tuple(256, &vpx_satd_neon), + make_tuple(1024, &vpx_satd_neon))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon), + make_tuple(64, &vpx_highbd_satd_neon), + make_tuple(256, &vpx_highbd_satd_neon), + make_tuple(1024, &vpx_highbd_satd_neon))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + NEON, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon), + make_tuple(64, &vp9_block_error_fp_neon), + make_tuple(256, &vp9_block_error_fp_neon), + make_tuple(1024, &vp9_block_error_fp_neon))); +#endif // HAVE_NEON + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_sve), + make_tuple(64, &vp9_block_error_fp_sve), + make_tuple(256, &vp9_block_error_fp_sve), + make_tuple(1024, &vp9_block_error_fp_sve))); +#endif // HAVE_SVE #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, AverageTest, ::testing::Values(make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa), make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa), @@ -392,6 +737,30 @@ INSTANTIATE_TEST_CASE_P( make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa), make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa), make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa))); -#endif + +INSTANTIATE_TEST_SUITE_P( + MSA, IntProRowTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(32, &vpx_int_pro_row_msa, &vpx_int_pro_row_c), + make_tuple(64, &vpx_int_pro_row_msa, + &vpx_int_pro_row_c))); + +INSTANTIATE_TEST_SUITE_P( + MSA, IntProColTest, + ::testing::Values(make_tuple(16, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(32, &vpx_int_pro_col_msa, &vpx_int_pro_col_c), + make_tuple(64, &vpx_int_pro_col_msa, + &vpx_int_pro_col_c))); + +// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are +// in place. +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(MSA, SatdLowbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_msa), + make_tuple(64, &vpx_satd_msa), + make_tuple(256, &vpx_satd_msa), + make_tuple(1024, &vpx_satd_msa))); +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA } // namespace diff --git a/media/libvpx/libvpx/test/bench.cc b/media/libvpx/libvpx/test/bench.cc new file mode 100644 index 0000000000..0783f2a734 --- /dev/null +++ b/media/libvpx/libvpx/test/bench.cc @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::RunNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + Run(); + } + vpx_usec_timer_mark(&timer); + times_[r] = static_cast(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::PrintMedian(const char *title) { + std::sort(times_, times_ + VPX_BENCH_ROBUST_ITER); + const int med = times_[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times_[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/media/libvpx/libvpx/test/bench.h b/media/libvpx/libvpx/test/bench.h new file mode 100644 index 0000000000..203e4d247e --- /dev/null +++ b/media/libvpx/libvpx/test/bench.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BENCH_H_ +#define VPX_TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + virtual ~AbstractBench() = default; + + void RunNTimes(int n); + void PrintMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. + virtual void Run() = 0; + + private: + int times_[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // VPX_TEST_BENCH_H_ diff --git a/media/libvpx/libvpx/test/blockiness_test.cc b/media/libvpx/libvpx/test/blockiness_test.cc index 2fa10192f1..6e8301bb52 100644 --- a/media/libvpx/libvpx/test/blockiness_test.cc +++ b/media/libvpx/libvpx/test/blockiness_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #if CONFIG_VP9_ENCODER @@ -25,10 +26,7 @@ #include "test/util.h" #include "vpx_mem/vpx_mem.h" - -extern "C" double vp9_get_blockiness(const unsigned char *img1, int img1_pitch, - const unsigned char *img2, int img2_pitch, - int width, int height); +#include "vp9/encoder/vp9_blockiness.h" using libvpx_test::ACMRandom; @@ -37,28 +35,28 @@ class BlockinessTestBase : public ::testing::Test { public: BlockinessTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_); - source_data_ = NULL; + source_data_ = nullptr; vpx_free(reference_data_); - reference_data_ = NULL; + reference_data_ = nullptr; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: // Handle frames up to 640x480 static const int kDataAlignment = 16; static const int kDataBufferSize = 640 * 480; - virtual void SetUp() { + void SetUp() override { source_stride_ = (width_ + 31) & ~31; reference_stride_ = width_ * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -141,7 +139,7 @@ class BlockinessTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple BlockinessParam; +using BlockinessParam = std::tuple; class BlockinessVP9Test : public BlockinessTestBase, public ::testing::WithParamInterface { @@ -156,8 +154,8 @@ class BlockinessVP9Test }; #endif // CONFIG_VP9_ENCODER -uint8_t *BlockinessTestBase::source_data_ = NULL; -uint8_t *BlockinessTestBase::reference_data_ = NULL; +uint8_t *BlockinessTestBase::source_data_ = nullptr; +uint8_t *BlockinessTestBase::reference_data_ = nullptr; #if CONFIG_VP9_ENCODER TEST_P(BlockinessVP9Test, SourceBlockierThanReference) { @@ -208,16 +206,17 @@ TEST_P(BlockinessVP9Test, WorstCaseBlockiness) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const BlockinessParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; -INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); +const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; +INSTANTIATE_TEST_SUITE_P(C, BlockinessVP9Test, + ::testing::ValuesIn(c_vp9_tests)); #endif } // namespace diff --git a/media/libvpx/libvpx/test/borders_test.cc b/media/libvpx/libvpx/test/borders_test.cc index e66ff02e25..57ee179aaf 100644 --- a/media/libvpx/libvpx/test/borders_test.cc +++ b/media/libvpx/libvpx/test/borders_test.cc @@ -9,11 +9,12 @@ */ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "vpx_config.h" namespace { @@ -22,16 +23,16 @@ class BordersTest public ::libvpx_test::CodecTestWithParam { protected: BordersTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~BordersTest() {} + ~BordersTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); @@ -40,7 +41,7 @@ class BordersTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { } } @@ -79,6 +80,11 @@ TEST_P(BordersTest, TestLowBitrate) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(BordersTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); +#if CONFIG_REALTIME_ONLY +VP9_INSTANTIATE_TEST_SUITE(BordersTest, + ::testing::Values(::libvpx_test::kRealTime)); +#else +VP9_INSTANTIATE_TEST_SUITE(BordersTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +#endif } // namespace diff --git a/media/libvpx/libvpx/test/buffer.h b/media/libvpx/libvpx/test/buffer.h new file mode 100644 index 0000000000..f2846323b3 --- /dev/null +++ b/media/libvpx/libvpx/test/buffer.h @@ -0,0 +1,382 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BUFFER_H_ +#define VPX_TEST_BUFFER_H_ + +#include + +#include + +#include "gtest/gtest.h" + +#include "test/acm_random.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" + +namespace libvpx_test { + +template +class Buffer { + public: + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(nullptr) {} + + Buffer(int width, int height, int padding) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } + + T *TopLeftPixel() const; + + int stride() const { return stride_; } + + // Set the buffer (excluding padding) to 'value'. + void Set(const T value); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. + void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + + // Copy the contents of Buffer 'a' (excluding padding). + void CopyFrom(const Buffer &a); + + void DumpBuffer() const; + + // Highlight the differences between two buffers if they are the same size. + void PrintDifference(const Buffer &a) const; + + bool HasPadding() const; + + // Sets all the values in the buffer to 'padding_value'. + void SetPadding(const T padding_value); + + // Checks if all the values (excluding padding) are equal to 'value' if the + // Buffers are the same size. + bool CheckValues(const T value) const; + + // Check that padding matches the expected value or there is no padding. + bool CheckPadding() const; + + // Compare the non-padding portion of two buffers if they are the same size. + bool CheckValues(const Buffer &a) const; + + bool Init() { + if (raw_buffer_ != nullptr) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); + stride_ = left_padding_ + width_ + right_padding_; + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_NE(raw_buffer_, nullptr); + SetPadding(std::numeric_limits::max()); + return !::testing::Test::HasFailure(); + } + + private: + bool BufferSizesMatch(const Buffer &a) const; + + const int width_; + const int height_; + const int top_padding_; + const int left_padding_; + const int right_padding_; + const int bottom_padding_; + const unsigned int alignment_; + T padding_value_; + int stride_; + int raw_size_; + int num_elements_; + T *raw_buffer_; +}; + +template +T *Buffer::TopLeftPixel() const { + if (!raw_buffer_) return nullptr; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; +} + +template +void Buffer::Set(const T value) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = value; + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + src[width] = (*rand_class.*rand_func)(); + } + src += stride_; + } +} + +template +void Buffer::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast(high) - low, + std::numeric_limits::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. + const int32_t value = + static_cast((*rand_class).RandRange(high - low)); + src[width] = static_cast(value + low); + } + src += stride_; + } +} + +template +void Buffer::CopyFrom(const Buffer &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + b_src[width] = a_src[width]; + } + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::DumpBuffer() const { + if (!raw_buffer_) return; + for (int height = 0; height < height_ + top_padding_ + bottom_padding_; + ++height) { + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); + } + printf("\n"); + } +} + +template +bool Buffer::HasPadding() const { + if (!raw_buffer_) return false; + return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; +} + +template +void Buffer::PrintDifference(const Buffer &a) const { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; + + T *a_src = a.TopLeftPixel(); + T *b_src = TopLeftPixel(); + + printf("This buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", b_src[width]); + } else { + printf("%4d", b_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } + + a_src = a.TopLeftPixel(); + b_src = TopLeftPixel(); + + printf("Reference buffer:\n"); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + printf("*%3d", a_src[width]); + } else { + printf("%4d", a_src[width]); + } + } + printf("\n"); + a_src += a.stride(); + b_src += this->stride(); + } +} + +template +void Buffer::SetPadding(const T padding_value) { + if (!raw_buffer_) return; + padding_value_ = padding_value; + + T *src = raw_buffer_; + for (int i = 0; i < num_elements_; ++i) { + src[i] = padding_value; + } +} + +template +bool Buffer::CheckValues(const T value) const { + if (!raw_buffer_) return false; + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (value != src[width]) { + return false; + } + } + src += stride_; + } + return true; +} + +template +bool Buffer::CheckPadding() const { + if (!raw_buffer_) return false; + if (!HasPadding()) return true; + + // Top padding. + T const *top = raw_buffer_; + for (int i = 0; i < stride_ * top_padding_; ++i) { + if (padding_value_ != top[i]) { + return false; + } + } + + // Left padding. + T const *left = TopLeftPixel() - left_padding_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < left_padding_; ++width) { + if (padding_value_ != left[width]) { + return false; + } + } + left += stride_; + } + + // Right padding. + T const *right = TopLeftPixel() + width_; + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < right_padding_; ++width) { + if (padding_value_ != right[width]) { + return false; + } + } + right += stride_; + } + + // Bottom padding + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { + if (padding_value_ != bottom[i]) { + return false; + } + } + + return true; +} + +template +bool Buffer::CheckValues(const Buffer &a) const { + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; + + T *a_src = a.TopLeftPixel(); + T *b_src = this->TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + if (a_src[width] != b_src[width]) { + return false; + } + } + a_src += a.stride(); + b_src += this->stride(); + } + return true; +} + +template +bool Buffer::BufferSizesMatch(const Buffer &a) const { + if (!raw_buffer_) return false; + if (a.width_ != this->width_ || a.height_ != this->height_) { + printf( + "Reference buffer of size %dx%d does not match this buffer which is " + "size %dx%d\n", + a.width_, a.height_, this->width_, this->height_); + return false; + } + + return true; +} +} // namespace libvpx_test +#endif // VPX_TEST_BUFFER_H_ diff --git a/media/libvpx/libvpx/test/byte_alignment_test.cc b/media/libvpx/libvpx/test/byte_alignment_test.cc index d78294d10f..ba6fffc524 100644 --- a/media/libvpx/libvpx/test/byte_alignment_test.cc +++ b/media/libvpx/libvpx/test/byte_alignment_test.cc @@ -55,23 +55,24 @@ const ByteAlignmentTestParam kBaTestParams[] = { class ByteAlignmentTest : public ::testing::TestWithParam { protected: - ByteAlignmentTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {} + ByteAlignmentTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); - ASSERT_TRUE(video_ != NULL); + ASSERT_NE(video_, nullptr); video_->Init(); video_->Begin(); const vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + ASSERT_NE(decoder_, nullptr); OpenMd5File(kVP9Md5File); } - virtual void TearDown() { - if (md5_file_ != NULL) fclose(md5_file_); + void TearDown() override { + if (md5_file_ != nullptr) fclose(md5_file_); delete decoder_; delete video_; @@ -90,7 +91,7 @@ class ByteAlignmentTest } vpx_codec_err_t DecodeRemainingFrames(int byte_alignment_to_check) { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -113,7 +114,7 @@ class ByteAlignmentTest const vpx_image_t *img; // Get decompressed data - while ((img = dec_iter.Next()) != NULL) { + while ((img = dec_iter.Next()) != nullptr) { if (byte_alignment_to_check == kLegacyByteAlignment) { CheckByteAlignment(img->planes[0], kLegacyYPlaneByteAlignment); } else { @@ -128,12 +129,12 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name_; + ASSERT_NE(md5_file_, nullptr) + << "MD5 file open failed. Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -171,12 +172,13 @@ TEST_F(ByteAlignmentTest, SwitchByteAlignment) { TEST_P(ByteAlignmentTest, TestAlignment) { const ByteAlignmentTestParam t = GetParam(); SetByteAlignment(t.byte_alignment, t.expected_value); - if (t.decode_remaining) + if (t.decode_remaining) { ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment)); + } } -INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest, - ::testing::ValuesIn(kBaTestParams)); +INSTANTIATE_TEST_SUITE_P(Alignments, ByteAlignmentTest, + ::testing::ValuesIn(kBaTestParams)); #endif // CONFIG_WEBM_IO diff --git a/media/libvpx/libvpx/test/clear_system_state.h b/media/libvpx/libvpx/test/clear_system_state.h index 044a5c7583..ba3c0b386a 100644 --- a/media/libvpx/libvpx/test/clear_system_state.h +++ b/media/libvpx/libvpx/test/clear_system_state.h @@ -7,23 +7,17 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CLEAR_SYSTEM_STATE_H_ -#define TEST_CLEAR_SYSTEM_STATE_H_ +#ifndef VPX_TEST_CLEAR_SYSTEM_STATE_H_ +#define VPX_TEST_CLEAR_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif +#include "vpx_ports/system_state.h" namespace libvpx_test { // Reset system to a known state. This function should be used for all non-API // test cases. -inline void ClearSystemState() { -#if ARCH_X86 || ARCH_X86_64 - vpx_reset_mmx_state(); -#endif -} +inline void ClearSystemState() { vpx_clear_system_state(); } } // namespace libvpx_test -#endif // TEST_CLEAR_SYSTEM_STATE_H_ +#endif // VPX_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/media/libvpx/libvpx/test/codec_factory.h b/media/libvpx/libvpx/test/codec_factory.h index d5882ed9c8..179ccdf011 100644 --- a/media/libvpx/libvpx/test/codec_factory.h +++ b/media/libvpx/libvpx/test/codec_factory.h @@ -7,8 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CODEC_FACTORY_H_ -#define TEST_CODEC_FACTORY_H_ +#ifndef VPX_TEST_CODEC_FACTORY_H_ +#define VPX_TEST_CODEC_FACTORY_H_ + +#include #include "./vpx_config.h" #include "vpx/vpx_decoder.h" @@ -38,7 +40,7 @@ class CodecFactory { const vpx_codec_flags_t flags) const = 0; virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, + vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) const = 0; @@ -53,23 +55,22 @@ class CodecFactory { template class CodecTestWithParam : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith2Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith3Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith4Params : public ::testing::TestWithParam< - std::tr1::tuple > { -}; + std::tuple > {}; /* * VP8 Codec Definitions @@ -83,27 +84,27 @@ class VP8Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_DECODER return &vpx_codec_vp8_dx_algo; #else - return NULL; + return nullptr; #endif } }; class VP8Encoder : public Encoder { public: - VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + VP8Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP8_ENCODER return &vpx_codec_vp8_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -112,25 +113,24 @@ class VP8CodecFactory : public CodecFactory { public: VP8CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP8_DECODER return new VP8Decoder(cfg, flags); #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP8_ENCODER return new VP8Encoder(cfg, deadline, init_flags, stats); #else @@ -138,12 +138,12 @@ class VP8CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP8_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage); #else @@ -156,15 +156,17 @@ class VP8CodecFactory : public CodecFactory { const libvpx_test::VP8CodecFactory kVP8; -#define VP8_INSTANTIATE_TEST_CASE(test, ...) \ - INSTANTIATE_TEST_CASE_P( \ +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ VP8, test, \ ::testing::Combine( \ ::testing::Values(static_cast( \ &libvpx_test::kVP8)), \ __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_CASE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP8_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP8 == 0, "") #endif // CONFIG_VP8 /* @@ -179,27 +181,27 @@ class VP9Decoder : public Decoder { : Decoder(cfg, flag) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_DECODER return &vpx_codec_vp9_dx_algo; #else - return NULL; + return nullptr; #endif } }; class VP9Encoder : public Encoder { public: - VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + VP9Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : Encoder(cfg, deadline, init_flags, stats) {} protected: - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else - return NULL; + return nullptr; #endif } }; @@ -208,25 +210,24 @@ class VP9CodecFactory : public CodecFactory { public: VP9CodecFactory() : CodecFactory() {} - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override { return CreateDecoder(cfg, 0); } - virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, - const vpx_codec_flags_t flags) const { + Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg, + const vpx_codec_flags_t flags) const override { #if CONFIG_VP9_DECODER return new VP9Decoder(cfg, flags); #else (void)cfg; (void)flags; - return NULL; + return nullptr; #endif } - virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, - unsigned long deadline, - const unsigned long init_flags, - TwopassStatsStore *stats) const { + Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, + TwopassStatsStore *stats) const override { #if CONFIG_VP9_ENCODER return new VP9Encoder(cfg, deadline, init_flags, stats); #else @@ -234,12 +235,12 @@ class VP9CodecFactory : public CodecFactory { (void)deadline; (void)init_flags; (void)stats; - return NULL; + return nullptr; #endif } - virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, - int usage) const { + vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg, + int usage) const override { #if CONFIG_VP9_ENCODER return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage); #else @@ -252,16 +253,18 @@ class VP9CodecFactory : public CodecFactory { const libvpx_test::VP9CodecFactory kVP9; -#define VP9_INSTANTIATE_TEST_CASE(test, ...) \ - INSTANTIATE_TEST_CASE_P( \ +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) \ + INSTANTIATE_TEST_SUITE_P( \ VP9, test, \ ::testing::Combine( \ ::testing::Values(static_cast( \ &libvpx_test::kVP9)), \ __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_CASE(test, ...) +// static_assert() is used to avoid warnings about an extra ';' outside of a +// function. +#define VP9_INSTANTIATE_TEST_SUITE(test, ...) static_assert(CONFIG_VP9 == 0, "") #endif // CONFIG_VP9 } // namespace libvpx_test -#endif // TEST_CODEC_FACTORY_H_ +#endif // VPX_TEST_CODEC_FACTORY_H_ diff --git a/media/libvpx/libvpx/test/comp_avg_pred_test.cc b/media/libvpx/libvpx/test/comp_avg_pred_test.cc new file mode 100644 index 0000000000..de9842a88f --- /dev/null +++ b/media/libvpx/libvpx/test/comp_avg_pred_test.cc @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_config.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +template +Pixel avg_with_rounding(Pixel a, Pixel b) { + return (a + b + 1) >> 1; +} + +template +void reference_pred(const Buffer &pred, const Buffer &ref, + int width, int height, Buffer *avg) { + ASSERT_NE(avg->TopLeftPixel(), nullptr); + ASSERT_NE(pred.TopLeftPixel(), nullptr); + ASSERT_NE(ref.TopLeftPixel(), nullptr); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); + } + } +} + +using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride); + +template +class AvgPredTest : public ::testing::TestWithParam { + public: + void SetUp() override { + avg_pred_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + void TestSizeCombinations(); + void TestCompareReferenceRandom(); + void TestSpeed(); + + protected: + AvgPredFunc avg_pred_func_; + ACMRandom rnd_; +}; + +template +void AvgPredTest::TestSizeCombinations() { + // This is called as part of the sub pixel variance. As such it must be one of + // the variance block sizes. + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + // The sse2 special-cases when ref width == stride, so make sure to test + // it. + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + // Only the reference buffer may have a stride not equal to width. + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_chk.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } + } + } +} + +template +void AvgPredTest::TestCompareReferenceRandom() { + const int width = 64; + const int height = 32; + Buffer ref = Buffer(width, height, 8); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg_ref = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_ref.Init()); + Buffer avg_chk = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg_chk.Init()); + + for (int i = 0; i < 500; ++i) { + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_( + (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(), + width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride())); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } + } +} + +template +void AvgPredTest::TestSpeed() { + for (int width_pow = 2; width_pow <= 6; ++width_pow) { + for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; + ++height_pow) { + // Don't test 4x2 or 64x128 + if (height_pow == 1 || height_pow == 7) continue; + + for (int ref_padding = 0; ref_padding < 2; ref_padding++) { + const int width = 1 << width_pow; + const int height = 1 << height_pow; + Buffer ref = Buffer(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer pred = Buffer(width, height, 0, 32); + ASSERT_TRUE(pred.Init()); + Buffer avg = Buffer(width, height, 0, 32); + ASSERT_TRUE(avg.Init()); + const int bitdepth_mask = (1 << bitdepth) - 1; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask; + } + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 100000000 / (width * height); ++i) { + avg_pred_func_((uint8_t *)avg.TopLeftPixel(), + (uint8_t *)pred.TopLeftPixel(), width, height, + (uint8_t *)ref.TopLeftPixel(), ref.stride()); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Average Test (ref_padding: %d) %dx%d time: %5d us\n", + ref_padding, width, height, elapsed_time); + } + } + } +} + +using AvgPredTestLBD = AvgPredTest<8, uint8_t>; + +TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_vsx)); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD, + ::testing::Values(&vpx_comp_avg_pred_lsx)); +#endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h, + const uint16_t *c, int c_stride); + +template +void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h, + const uint8_t *c, int c_stride) { + fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride); +} + +using AvgPredTestHBD = AvgPredTest<12, uint16_t>; + +TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); } + +TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); } + +TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); } + +INSTANTIATE_TEST_SUITE_P( + C, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_NEON + +#endif // CONFIG_VP9_HIGHBITDEPTH +} // namespace diff --git a/media/libvpx/libvpx/test/config_test.cc b/media/libvpx/libvpx/test/config_test.cc index b2f8ea5ed3..d3aca4cfb5 100644 --- a/media/libvpx/libvpx/test/config_test.cc +++ b/media/libvpx/libvpx/test/config_test.cc @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" @@ -22,24 +22,24 @@ class ConfigTest ConfigTest() : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {} - virtual ~ConfigTest() {} + ~ConfigTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { frame_count_in_ = 0; frame_count_out_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) { + void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override { ++frame_count_in_; abort_ |= (frame_count_in_ >= frame_count_max_); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) { + void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override { ++frame_count_out_; } @@ -58,5 +58,5 @@ TEST_P(ConfigTest, LagIsDisabled) { EXPECT_EQ(frame_count_in_, frame_count_out_); } -VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES); +VP8_INSTANTIATE_TEST_SUITE(ConfigTest, ONE_PASS_TEST_MODES); } // namespace diff --git a/media/libvpx/libvpx/test/consistency_test.cc b/media/libvpx/libvpx/test/consistency_test.cc index 37b4a45e54..3e73bec492 100644 --- a/media/libvpx/libvpx/test/consistency_test.cc +++ b/media/libvpx/libvpx/test/consistency_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #if CONFIG_VP9_ENCODER @@ -38,7 +39,7 @@ class ConsistencyTestBase : public ::testing::Test { public: ConsistencyTestBase(int width, int height) : width_(width), height_(height) {} - static void SetUpTestCase() { + static void SetUpTestSuite() { source_data_[0] = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBufferSize)); reference_data_[0] = reinterpret_cast( @@ -51,27 +52,27 @@ class ConsistencyTestBase : public ::testing::Test { } static void ClearSsim() { memset(ssim_array_, 0, kDataBufferSize / 16); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(source_data_[0]); - source_data_[0] = NULL; + source_data_[0] = nullptr; vpx_free(reference_data_[0]); - reference_data_[0] = NULL; + reference_data_[0] = nullptr; vpx_free(source_data_[1]); - source_data_[1] = NULL; + source_data_[1] = nullptr; vpx_free(reference_data_[1]); - reference_data_[1] = NULL; + reference_data_[1] = nullptr; delete[] ssim_array_; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: // Handle frames up to 640x480 static const int kDataAlignment = 16; static const int kDataBufferSize = 640 * 480; - virtual void SetUp() { + void SetUp() override { source_stride_ = (width_ + 31) & ~31; reference_stride_ = width_ * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -127,7 +128,7 @@ class ConsistencyTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple ConsistencyParam; +using ConsistencyParam = std::tuple; class ConsistencyVP9Test : public ConsistencyTestBase, public ::testing::WithParamInterface { @@ -144,9 +145,9 @@ class ConsistencyVP9Test }; #endif // CONFIG_VP9_ENCODER -uint8_t *ConsistencyTestBase::source_data_[2] = { NULL, NULL }; -uint8_t *ConsistencyTestBase::reference_data_[2] = { NULL, NULL }; -Ssimv *ConsistencyTestBase::ssim_array_ = NULL; +uint8_t *ConsistencyTestBase::source_data_[2] = { nullptr, nullptr }; +uint8_t *ConsistencyTestBase::reference_data_[2] = { nullptr, nullptr }; +Ssimv *ConsistencyTestBase::ssim_array_ = nullptr; #if CONFIG_VP9_ENCODER TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { @@ -198,17 +199,17 @@ TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const ConsistencyParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; -INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, - ::testing::ValuesIn(c_vp9_tests)); +const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; +INSTANTIATE_TEST_SUITE_P(C, ConsistencyVP9Test, + ::testing::ValuesIn(c_vp9_tests)); #endif } // namespace diff --git a/media/libvpx/libvpx/test/convolve_test.cc b/media/libvpx/libvpx/test/convolve_test.cc index 0056c602b6..6eecbd3e5d 100644 --- a/media/libvpx/libvpx/test/convolve_test.cc +++ b/media/libvpx/libvpx/test/convolve_test.cc @@ -9,8 +9,9 @@ */ #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -25,22 +26,30 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { static const unsigned int kMaxDimension = 64; -typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); +using ConvolveFunc = void (*)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +using ConvolveFunc12Tap = void (*)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); +#endif -typedef void (*WrapperFilterBlock2d8Func)( - const uint8_t *src_ptr, const unsigned int src_stride, - const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, - unsigned int dst_stride, unsigned int output_width, - unsigned int output_height, int use_highbd); +using WrapperFilterBlock2d8Func = + void (*)(const uint8_t *src_ptr, const unsigned int src_stride, + const int16_t *hfilter, const int16_t *vfilter, uint8_t *dst_ptr, + unsigned int dst_stride, unsigned int output_width, + unsigned int output_height, int use_highbd); struct ConvolveFunctions { ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8, @@ -76,7 +85,26 @@ struct ConvolveFunctions { int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. }; -typedef std::tr1::tuple ConvolveParam; +using ConvolveParam = std::tuple; + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +struct ConvolveFunctions12Tap { + ConvolveFunctions12Tap(ConvolveFunc12Tap h12, ConvolveFunc12Tap v12, + ConvolveFunc12Tap hv12, int bd) + : use_highbd_(bd) { + h12_ = h12; + v12_ = v12; + hv12_ = hv12; + } + + ConvolveFunc12Tap h12_; + ConvolveFunc12Tap v12_; + ConvolveFunc12Tap hv12_; + int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. +}; + +using Convolve12TapParam = std::tuple; +#endif #define ALL_SIZES(convolve_fn) \ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \ @@ -86,7 +114,13 @@ typedef std::tr1::tuple ConvolveParam; make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \ make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \ make_tuple(64, 64, &convolve_fn) - +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#define ALL_SIZES_12TAP(convolve_fn) \ + make_tuple(8, 8, &convolve_fn), make_tuple(16, 8, &convolve_fn), \ + make_tuple(8, 16, &convolve_fn), make_tuple(16, 16, &convolve_fn), \ + make_tuple(32, 16, &convolve_fn), make_tuple(16, 32, &convolve_fn), \ + make_tuple(32, 32, &convolve_fn) +#endif // Reference 8-tap subpixel filter, slightly modified to fit into this test. #define VP9_FILTER_WEIGHT 128 #define VP9_FILTER_SHIFT 7 @@ -113,6 +147,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, // and filter_max_width = 16 // uint8_t intermediate_buffer[71 * kMaxDimension]; + vp9_zero(intermediate_buffer); const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); @@ -212,6 +247,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); + vp9_zero(intermediate_buffer); + // Horizontal pass (src -> transposed intermediate). { uint16_t *output_ptr = intermediate_buffer; @@ -239,7 +276,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, // Vertical pass (transposed intermediate -> dst). { - uint16_t *src_ptr = intermediate_buffer; + src_ptr = intermediate_buffer; const int dst_next_row_stride = dst_stride - output_width; unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -300,9 +337,9 @@ void wrapper_filter_average_block2d_8_c( filter_average_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, output_width, output_height); } else { - highbd_filter_average_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + highbd_filter_average_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, vfilter, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + CAST_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, use_highbd); } #else @@ -323,8 +360,8 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, filter_block2d_8_c(src_ptr, src_stride, hfilter, vfilter, dst_ptr, dst_stride, output_width, output_height); } else { - highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride, hfilter, - vfilter, CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + highbd_filter_block2d_8_c(CAST_TO_SHORTPTR(src_ptr), src_stride, hfilter, + vfilter, CAST_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height, use_highbd); } #else @@ -336,7 +373,7 @@ void wrapper_filter_block2d_8_c(const uint8_t *src_ptr, class ConvolveTest : public ::testing::TestWithParam { public: - static void SetUpTestCase() { + static void SetUpTestSuite() { // Force input_ to be unaligned, output to be 16 byte aligned. input_ = reinterpret_cast( vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + @@ -356,22 +393,22 @@ class ConvolveTest : public ::testing::TestWithParam { #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { vpx_free(input_ - 1); - input_ = NULL; + input_ = nullptr; vpx_free(output_); - output_ = NULL; + output_ = nullptr; vpx_free(output_ref_); - output_ref_ = NULL; + output_ref_ = nullptr; #if CONFIG_VP9_HIGHBITDEPTH vpx_free(input16_ - 1); - input16_ = NULL; + input16_ = nullptr; vpx_free(output16_); - output16_ = NULL; + output16_ = nullptr; vpx_free(output16_ref_); - output16_ref_ = NULL; + output16_ref_ = nullptr; #endif } @@ -398,7 +435,7 @@ class ConvolveTest : public ::testing::TestWithParam { i % kOuterBlockSize >= (BorderLeft() + Width())); } - virtual void SetUp() { + void SetUp() override { UUT_ = GET_PARAM(2); #if CONFIG_VP9_HIGHBITDEPTH if (UUT_->use_highbd_ != 0) { @@ -411,8 +448,14 @@ class ConvolveTest : public ::testing::TestWithParam { for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) { output_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = mask_; +#endif } else { output_[i] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = 0; +#endif } } @@ -449,7 +492,9 @@ class ConvolveTest : public ::testing::TestWithParam { void CheckGuardBlocks() { for (int i = 0; i < kOutputBufferSize; ++i) { - if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]); + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } } } @@ -459,7 +504,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return input_ + offset; } else { - return CONVERT_TO_BYTEPTR(input16_) + offset; + return CAST_TO_BYTEPTR(input16_ + offset); } #else return input_ + offset; @@ -472,7 +517,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_) + offset; + return CAST_TO_BYTEPTR(output16_ + offset); } #else return output_ + offset; @@ -485,7 +530,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return output_ref_ + offset; } else { - return CONVERT_TO_BYTEPTR(output16_ref_) + offset; + return CAST_TO_BYTEPTR(output16_ref_ + offset); } #else return output_ref_ + offset; @@ -497,7 +542,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { return list[index]; } else { - return CONVERT_TO_SHORTPTR(list)[index]; + return CAST_TO_SHORTPTR(list)[index]; } #else return list[index]; @@ -509,7 +554,7 @@ class ConvolveTest : public ::testing::TestWithParam { if (UUT_->use_highbd_ == 0) { list[index] = (uint8_t)val; } else { - CONVERT_TO_SHORTPTR(list)[index] = val; + CAST_TO_SHORTPTR(list)[index] = val; } #else list[index] = (uint8_t)val; @@ -528,23 +573,679 @@ class ConvolveTest : public ::testing::TestWithParam { #endif }; -uint8_t *ConvolveTest::input_ = NULL; -uint8_t *ConvolveTest::output_ = NULL; -uint8_t *ConvolveTest::output_ref_ = NULL; +uint8_t *ConvolveTest::input_ = nullptr; +uint8_t *ConvolveTest::output_ = nullptr; +uint8_t *ConvolveTest::output_ref_ = nullptr; #if CONFIG_VP9_HIGHBITDEPTH -uint16_t *ConvolveTest::input16_ = NULL; -uint16_t *ConvolveTest::output16_ = NULL; -uint16_t *ConvolveTest::output16_ref_ = NULL; +uint16_t *ConvolveTest::input16_ = nullptr; +uint16_t *ConvolveTest::output16_ = nullptr; +uint16_t *ConvolveTest::output16_ref_ = nullptr; +#endif +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +class ConvolveTest12Tap : public ::testing::TestWithParam { + public: + static void SetUpTestSuite() { + // Force input_ to be unaligned, output to be 16 byte aligned. + input_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + + 1; + output_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kOutputBufferSize)); +#if CONFIG_VP9_HIGHBITDEPTH + input16_ = reinterpret_cast(vpx_memalign( + kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) + + 1; + output16_ = reinterpret_cast( + vpx_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t))); +#endif + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + static void TearDownTestSuite() { + vpx_free(input_ - 1); + input_ = nullptr; + vpx_free(output_); + output_ = nullptr; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_free(input16_ - 1); + input16_ = nullptr; + vpx_free(output16_); + output16_ = nullptr; +#endif + } + + protected: + static const int kDataAlignment = 16; + static const int kOuterBlockSize = 256; + static const int kInputStride = kOuterBlockSize; + static const int kOutputStride = kOuterBlockSize; + static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize; + static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize; + + int Width() const { return GET_PARAM(0); } + int Height() const { return GET_PARAM(1); } + int BorderLeft() const { + const int center = (kOuterBlockSize - Width()) / 2; + return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1); + } + int BorderTop() const { return (kOuterBlockSize - Height()) / 2; } + + bool IsIndexInBorder(int i) { + return (i < BorderTop() * kOuterBlockSize || + i >= (BorderTop() + Height()) * kOuterBlockSize || + i % kOuterBlockSize < BorderLeft() || + i % kOuterBlockSize >= (BorderLeft() + Width())); + } + + void SetUp() override { + UUT_ = GET_PARAM(2); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ != 0) { + mask_ = (1 << UUT_->use_highbd_) - 1; + } else { + mask_ = 255; + } +#endif + /* Set up guard blocks for an inner block centered in the outer block */ + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + output_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = mask_; +#endif + } else { + output_[i] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = 0; +#endif + } + } + + ::libvpx_test::ACMRandom prng; + for (int i = 0; i < kInputBufferSize; ++i) { + if (i & 1) { + input_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = mask_; +#endif + } else { + input_[i] = prng.Rand8Extremes(); +#if CONFIG_VP9_HIGHBITDEPTH + input16_[i] = prng.Rand16() & mask_; +#endif + } + } + } + + void SetConstantInput(int value) { + memset(input_, value, kInputBufferSize); +#if CONFIG_VP9_HIGHBITDEPTH + vpx_memset16(input16_, value, kInputBufferSize); +#endif + } + + void CheckGuardBlocks() { + for (int i = 0; i < kOutputBufferSize; ++i) { + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } + } + } + + uint8_t *input() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return input_ + offset; + } else { + return CAST_TO_BYTEPTR(input16_ + offset); + } +#else + return input_ + offset; +#endif + } + + uint8_t *output() const { + const int offset = BorderTop() * kOuterBlockSize + BorderLeft(); +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return output_ + offset; + } else { + return CAST_TO_BYTEPTR(output16_ + offset); + } +#else + return output_ + offset; +#endif + } + + uint16_t lookup(uint8_t *list, int index) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + return list[index]; + } else { + return CAST_TO_SHORTPTR(list)[index]; + } +#else + return list[index]; +#endif + } + + void assign_val(uint8_t *list, int index, uint16_t val) const { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + list[index] = (uint8_t)val; + } else { + CAST_TO_SHORTPTR(list)[index] = val; + } +#else + list[index] = (uint8_t)val; +#endif + } + const ConvolveFunctions12Tap *UUT_; + static uint8_t *input_; + static uint8_t *output_; +#if CONFIG_VP9_HIGHBITDEPTH + static uint16_t *input16_; + static uint16_t *output16_; + int mask_; +#endif +}; + +uint8_t *ConvolveTest12Tap::input_ = nullptr; +uint8_t *ConvolveTest12Tap::output_ = nullptr; +#if CONFIG_VP9_HIGHBITDEPTH +uint16_t *ConvolveTest12Tap::input16_ = nullptr; +uint16_t *ConvolveTest12Tap::output16_ = nullptr; +#endif + +TEST_P(ConvolveTest12Tap, MatchesReferenceSubpixelFilter) { + uint8_t *const in = input(); + uint8_t *const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } +#else + uint8_t ref[kOutputStride * kMaxDimension]; +#endif + + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif + + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } + } + + const InterpKernel12 *filters = sub_pel_filters_12; + for (int filter_x = 0; filter_x < 16; ++filter_x) { + for (int filter_y = 0; filter_y < 16; ++filter_y) { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height()); + } else { + vpx_highbd_convolve12_c(CAST_TO_SHORTPTR(in), kInputStride, + CAST_TO_SHORTPTR(ref), kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height(), + UUT_->use_highbd_); + } +#else + vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters, filter_x, + 16, filter_y, 16, Width(), Height()); +#endif + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv12_(in, kInputStride, out, kOutputStride, filters, filter_x, + 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK(UUT_->v12_(in, kInputStride, out, + kOutputStride, filters, 0, 16, + filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK(UUT_->h12_(in, kInputStride, out, + kOutputStride, filters, filter_x, + 16, 0, 16, Width(), Height())); + else + continue; + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" + << "," << filter_x << "," << filter_y << ")"; + } + } + } +} + +TEST_P(ConvolveTest12Tap, FilterExtremes) { + uint8_t *const in = input(); + uint8_t *const out = output(); +#if CONFIG_VP9_HIGHBITDEPTH + uint8_t ref8[kOutputStride * kMaxDimension]; + uint16_t ref16[kOutputStride * kMaxDimension]; + uint8_t *ref; + if (UUT_->use_highbd_ == 0) { + ref = ref8; + } else { + ref = CAST_TO_BYTEPTR(ref16); + } +#else + uint8_t ref[kOutputStride * kMaxDimension]; +#endif + + // Populate ref and out with some random data + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + uint16_t r; +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) { + r = prng.Rand8Extremes(); + } else { + r = prng.Rand16() & mask_; + } +#else + r = prng.Rand8Extremes(); +#endif + assign_val(out, y * kOutputStride + x, r); + assign_val(ref, y * kOutputStride + x, r); + } + } + + for (int axis = 0; axis < 2; axis++) { + int seed_val = 0; + while (seed_val < 256) { + for (int y = 0; y < 8; ++y) { + for (int x = 0; x < 8; ++x) { +#if CONFIG_VP9_HIGHBITDEPTH + assign_val(in, y * kOutputStride + x - MAX_FILTER_TAP / 2 + 1, + ((seed_val >> (axis ? y : x)) & 1) * mask_); +#else + assign_val(in, y * kOutputStride + x - MAX_FILTER_TAP / 2 + 1, + ((seed_val >> (axis ? y : x)) & 1) * 255); +#endif + if (axis) seed_val++; + } + if (axis) { + seed_val -= 8; + } else { + seed_val++; + } + } + if (axis) seed_val += 8; + + const InterpKernel12 *filters = sub_pel_filters_12; + for (int filter_x = 0; filter_x < 16; ++filter_x) { + for (int filter_y = 0; filter_y < 16; ++filter_y) { +#if CONFIG_VP9_HIGHBITDEPTH + if (UUT_->use_highbd_ == 0) { + vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height()); + } else { + vpx_highbd_convolve12_c(CAST_TO_SHORTPTR(in), kInputStride, + CAST_TO_SHORTPTR(ref), kOutputStride, + filters, filter_x, 16, filter_y, 16, + Width(), Height(), UUT_->use_highbd_); + } +#else + vpx_convolve12_c(in, kInputStride, ref, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height()); +#endif + if (filter_x && filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->hv12_(in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); + else if (filter_y) + ASM_REGISTER_STATE_CHECK( + UUT_->v12_(in, kInputStride, out, kOutputStride, filters, 0, 16, + filter_y, 16, Width(), Height())); + else if (filter_x) + ASM_REGISTER_STATE_CHECK( + UUT_->h12_(in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); + else + continue; + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "mismatch at (" << x << "," << y << "), " + << "filters (" + << "," << filter_x << "," << filter_y << ")"; + } + } + } + } + } +} + +TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel12 *const twelvetap = sub_pel_filters_12; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve12_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel12 *const twelvetap = sub_pel_filters_12; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve12_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest12Tap, DISABLED_12Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel12 *const twelvetap = sub_pel_filters_12; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v12_(in, kInputStride, out, kOutputStride, twelvetap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve12_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} #endif TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); } +TEST_P(ConvolveTest, DISABLED_Copy_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_copy_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->copy_[1](in, kInputStride, out, kOutputStride, nullptr, 0, 0, 0, 0, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + nullptr, 0, 0, 0, 0, Width(), + Height())); CheckGuardBlocks(); @@ -563,7 +1264,8 @@ TEST_P(ConvolveTest, Avg) { CopyOutputToRef(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + nullptr, 0, 0, 0, 0, Width(), + Height())); CheckGuardBlocks(); @@ -580,12 +1282,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -600,12 +1300,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -620,12 +1318,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -637,7 +1333,7 @@ TEST_P(ConvolveTest, Copy2D) { } } -const int kNumFilterBanks = 4; +const int kNumFilterBanks = 5; const int kNumFilters = 16; TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { @@ -661,7 +1357,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c }; @@ -677,7 +1372,7 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; @@ -714,21 +1409,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[i](in, kInputStride, out, kOutputStride, nullptr, 0, + 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -756,7 +1451,7 @@ TEST_P(ConvolveTest, FilterExtremes) { if (UUT_->use_highbd_ == 0) { ref = ref8; } else { - ref = CONVERT_TO_BYTEPTR(ref16); + ref = CAST_TO_BYTEPTR(ref16); } #else uint8_t ref[kOutputStride * kMaxDimension]; @@ -812,21 +1507,21 @@ TEST_P(ConvolveTest, FilterExtremes) { filters[filter_y], ref, kOutputStride, Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->copy_[0](in, kInputStride, out, kOutputStride, nullptr, + 0, 0, 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -845,47 +1540,66 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. */ - ASM_REGISTER_STATE_CHECK( - UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac], - step, eighttap[frac], step, Width(), Height())); + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. */ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); - CheckGuardBlocks(); + CheckGuardBlocks(); - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH #define WRAP(func, bd) \ void wrap_##func##_##bd( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast(src), src_stride, \ + reinterpret_cast(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } -#if HAVE_SSE2 && ARCH_X86_64 +#if HAVE_SSE2 && VPX_ARCH_X86_64 WRAP(convolve_copy_sse2, 8) WRAP(convolve_avg_sse2, 8) WRAP(convolve_copy_sse2, 10) @@ -910,7 +1624,36 @@ WRAP(convolve8_vert_sse2, 12) WRAP(convolve8_avg_vert_sse2, 12) WRAP(convolve8_sse2, 12) WRAP(convolve8_avg_sse2, 12) -#endif // HAVE_SSE2 && ARCH_X86_64 +#endif // HAVE_SSE2 && VPX_ARCH_X86_64 + +#if HAVE_AVX2 +WRAP(convolve_copy_avx2, 8) +WRAP(convolve_avg_avx2, 8) +WRAP(convolve8_horiz_avx2, 8) +WRAP(convolve8_avg_horiz_avx2, 8) +WRAP(convolve8_vert_avx2, 8) +WRAP(convolve8_avg_vert_avx2, 8) +WRAP(convolve8_avx2, 8) +WRAP(convolve8_avg_avx2, 8) + +WRAP(convolve_copy_avx2, 10) +WRAP(convolve_avg_avx2, 10) +WRAP(convolve8_avx2, 10) +WRAP(convolve8_horiz_avx2, 10) +WRAP(convolve8_vert_avx2, 10) +WRAP(convolve8_avg_avx2, 10) +WRAP(convolve8_avg_horiz_avx2, 10) +WRAP(convolve8_avg_vert_avx2, 10) + +WRAP(convolve_copy_avx2, 12) +WRAP(convolve_avg_avx2, 12) +WRAP(convolve8_avx2, 12) +WRAP(convolve8_horiz_avx2, 12) +WRAP(convolve8_vert_avx2, 12) +WRAP(convolve8_avg_avx2, 12) +WRAP(convolve8_avg_horiz_avx2, 12) +WRAP(convolve8_avg_vert_avx2, 12) +#endif // HAVE_AVX2 #if HAVE_NEON WRAP(convolve_copy_neon, 8) @@ -939,6 +1682,30 @@ WRAP(convolve8_neon, 12) WRAP(convolve8_avg_neon, 12) #endif // HAVE_NEON +#if HAVE_SVE +WRAP(convolve8_horiz_sve, 8) +WRAP(convolve8_avg_horiz_sve, 8) +WRAP(convolve8_horiz_sve, 10) +WRAP(convolve8_avg_horiz_sve, 10) +WRAP(convolve8_horiz_sve, 12) +WRAP(convolve8_avg_horiz_sve, 12) +#endif // HAVE_SVE + +#if HAVE_SVE2 +WRAP(convolve8_sve2, 8) +WRAP(convolve8_avg_sve2, 8) +WRAP(convolve8_vert_sve2, 8) +WRAP(convolve8_avg_vert_sve2, 8) +WRAP(convolve8_sve2, 10) +WRAP(convolve8_avg_sve2, 10) +WRAP(convolve8_vert_sve2, 10) +WRAP(convolve8_avg_vert_sve2, 10) +WRAP(convolve8_sve2, 12) +WRAP(convolve8_avg_sve2, 12) +WRAP(convolve8_vert_sve2, 12) +WRAP(convolve8_avg_vert_sve2, 12) +#endif // HAVE_SVE2 + WRAP(convolve_copy_c, 8) WRAP(convolve_avg_c, 8) WRAP(convolve8_horiz_c, 8) @@ -986,9 +1753,9 @@ const ConvolveFunctions convolve12_c( wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12); -const ConvolveParam kArrayConvolve_c[] = { - ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c) -}; +const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c), + ALL_SIZES(convolve10_c), + ALL_SIZES(convolve12_c) }; #else const ConvolveFunctions convolve8_c( @@ -999,9 +1766,107 @@ const ConvolveFunctions convolve8_c( vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) }; #endif -INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c)); +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_c)); +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#if CONFIG_VP9_HIGHBITDEPTH +#define WRAP12TAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast(src), src_stride, \ + reinterpret_cast(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ + } -#if HAVE_SSE2 && ARCH_X86_64 +#if HAVE_AVX2 +WRAP12TAP(convolve12_horiz_avx2, 8) +WRAP12TAP(convolve12_vert_avx2, 8) +WRAP12TAP(convolve12_avx2, 8) +WRAP12TAP(convolve12_horiz_avx2, 10) +WRAP12TAP(convolve12_vert_avx2, 10) +WRAP12TAP(convolve12_avx2, 10) +WRAP12TAP(convolve12_horiz_avx2, 12) +WRAP12TAP(convolve12_vert_avx2, 12) +WRAP12TAP(convolve12_avx2, 12) +#endif // HAVE_AVX2 + +#if HAVE_SSSE3 +WRAP12TAP(convolve12_horiz_ssse3, 8) +WRAP12TAP(convolve12_vert_ssse3, 8) +WRAP12TAP(convolve12_ssse3, 8) +WRAP12TAP(convolve12_horiz_ssse3, 10) +WRAP12TAP(convolve12_vert_ssse3, 10) +WRAP12TAP(convolve12_ssse3, 10) +WRAP12TAP(convolve12_horiz_ssse3, 12) +WRAP12TAP(convolve12_vert_ssse3, 12) +WRAP12TAP(convolve12_ssse3, 12) +#endif // HAVE_SSSE3 + +#if HAVE_NEON +WRAP12TAP(convolve12_horiz_neon, 8) +WRAP12TAP(convolve12_vert_neon, 8) +WRAP12TAP(convolve12_neon, 8) +WRAP12TAP(convolve12_horiz_neon, 10) +WRAP12TAP(convolve12_vert_neon, 10) +WRAP12TAP(convolve12_neon, 10) +WRAP12TAP(convolve12_horiz_neon, 12) +WRAP12TAP(convolve12_vert_neon, 12) +WRAP12TAP(convolve12_neon, 12) +#endif // HAVE_NEON + +#if HAVE_SVE2 +WRAP12TAP(convolve12_horiz_sve2, 8) +WRAP12TAP(convolve12_vert_sve2, 8) +WRAP12TAP(convolve12_sve2, 8) +WRAP12TAP(convolve12_horiz_sve2, 10) +WRAP12TAP(convolve12_vert_sve2, 10) +WRAP12TAP(convolve12_sve2, 10) +WRAP12TAP(convolve12_horiz_sve2, 12) +WRAP12TAP(convolve12_vert_sve2, 12) +WRAP12TAP(convolve12_sve2, 12) +#endif // HAVE_SVE2 + +WRAP12TAP(convolve12_horiz_c, 8) +WRAP12TAP(convolve12_vert_c, 8) +WRAP12TAP(convolve12_c, 8) +WRAP12TAP(convolve12_horiz_c, 10) +WRAP12TAP(convolve12_vert_c, 10) +WRAP12TAP(convolve12_c, 10) +WRAP12TAP(convolve12_horiz_c, 12) +WRAP12TAP(convolve12_vert_c, 12) +WRAP12TAP(convolve12_c, 12) +#undef WRAP12TAP + +const ConvolveFunctions12Tap convolve12tap_8bit_c(wrap_convolve12_horiz_c_8, + wrap_convolve12_vert_c_8, + wrap_convolve12_c_8, 8); + +const ConvolveFunctions12Tap convolve12tap_10bit_c(wrap_convolve12_horiz_c_10, + wrap_convolve12_vert_c_10, + wrap_convolve12_c_10, 10); + +const ConvolveFunctions12Tap convolve12tap_12bit_c(wrap_convolve12_horiz_c_12, + wrap_convolve12_vert_c_12, + wrap_convolve12_c_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_c[] = { + ALL_SIZES_12TAP(convolve12tap_8bit_c), ALL_SIZES_12TAP(convolve12tap_10bit_c), + ALL_SIZES_12TAP(convolve12tap_12bit_c) +}; +#else +const ConvolveFunctions12Tap convolve12Tap_c(vpx_convolve12_horiz_c, + vpx_convolve12_vert_c, + vpx_convolve12_c, 0); +const Convolve12TapParam kArrayConvolve12Tap_c[] = { ALL_SIZES_12TAP( + convolve12Tap_c) }; +#endif +INSTANTIATE_TEST_SUITE_P(C, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_c)); +#endif + +#if HAVE_SSE2 && VPX_ARCH_X86_64 #if CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_sse2( wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8, @@ -1040,8 +1905,8 @@ const ConvolveFunctions convolve8_sse2( const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) }; #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sse2)); #endif #if HAVE_SSSE3 @@ -1053,22 +1918,113 @@ const ConvolveFunctions convolve8_ssse3( vpx_scaled_avg_vert_c, vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) }; -INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_ssse3)); +INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_ssse3)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions12Tap convolve12tap_8bit_ssse3( + wrap_convolve12_horiz_ssse3_8, wrap_convolve12_vert_ssse3_8, + wrap_convolve12_ssse3_8, 8); + +const ConvolveFunctions12Tap convolve12tap_10bit_ssse3( + wrap_convolve12_horiz_ssse3_10, wrap_convolve12_vert_ssse3_10, + wrap_convolve12_ssse3_10, 10); + +const ConvolveFunctions12Tap convolve12tap_12bit_ssse3( + wrap_convolve12_horiz_ssse3_12, wrap_convolve12_vert_ssse3_12, + wrap_convolve12_ssse3_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_ssse3[] = { + ALL_SIZES_12TAP(convolve12tap_8bit_ssse3), + ALL_SIZES_12TAP(convolve12tap_10bit_ssse3), + ALL_SIZES_12TAP(convolve12tap_12bit_ssse3) +}; +#else +const ConvolveFunctions12Tap convolve12_ssse3(vpx_convolve12_horiz_ssse3, + vpx_convolve12_vert_ssse3, + vpx_convolve12_ssse3, 0); +const Convolve12TapParam kArrayConvolve12Tap_ssse3[] = { ALL_SIZES_12TAP( + convolve12_ssse3) }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_ssse3)); +#endif // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER #endif -#if HAVE_AVX2 && HAVE_SSSE3 +#if HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_avx2( + wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8, + wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8, + wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8, + wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_avx2( + wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10, + wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10, + wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10, + wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, + wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, 10); +const ConvolveFunctions convolve12_avx2( + wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12, + wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12, + wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12, + wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, + wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, 12); +const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2), + ALL_SIZES(convolve10_avx2), + ALL_SIZES(convolve12_avx2) }; +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, - vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); - const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; -INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_avx2)); -#endif // HAVE_AVX2 && HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_avx2)); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions12Tap convolve12Tap_8bit_avx2( + wrap_convolve12_horiz_avx2_8, wrap_convolve12_vert_avx2_8, + wrap_convolve12_avx2_8, 8); + +const ConvolveFunctions12Tap convolve12Tap_10bit_avx2( + wrap_convolve12_horiz_avx2_10, wrap_convolve12_vert_avx2_10, + wrap_convolve12_avx2_10, 10); + +const ConvolveFunctions12Tap convolve12Tap_12bit_avx2( + wrap_convolve12_horiz_avx2_12, wrap_convolve12_vert_avx2_12, + wrap_convolve12_avx2_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_avx2[] = { + ALL_SIZES_12TAP(convolve12Tap_8bit_avx2), + ALL_SIZES_12TAP(convolve12Tap_10bit_avx2), + ALL_SIZES_12TAP(convolve12Tap_12bit_avx2) +}; +#else +const ConvolveFunctions12Tap convolve12Tap_avx2(vpx_convolve12_horiz_avx2, + vpx_convolve12_vert_avx2, + vpx_convolve12_avx2, 0); +const Convolve12TapParam kArrayConvolve12Tap_avx2[] = { ALL_SIZES_12TAP( + convolve12Tap_avx2) }; +#endif +INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_avx2)); +#endif +#endif // HAVE_AVX2 #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH @@ -1105,14 +2061,188 @@ const ConvolveFunctions convolve8_neon( vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; #endif // CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions12Tap convolve12tap_8bit_neon( + wrap_convolve12_horiz_neon_8, wrap_convolve12_vert_neon_8, + wrap_convolve12_neon_8, 8); + +const ConvolveFunctions12Tap convolve12tap_10bit_neon( + wrap_convolve12_horiz_neon_10, wrap_convolve12_vert_neon_10, + wrap_convolve12_neon_10, 10); + +const ConvolveFunctions12Tap convolve12tap_12bit_neon( + wrap_convolve12_horiz_neon_12, wrap_convolve12_vert_neon_12, + wrap_convolve12_neon_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_neon[] = { + ALL_SIZES_12TAP(convolve12tap_8bit_neon), + ALL_SIZES_12TAP(convolve12tap_10bit_neon), + ALL_SIZES_12TAP(convolve12tap_12bit_neon) +}; + +#else + +const ConvolveFunctions12Tap convolve12Tap_neon(vpx_convolve12_horiz_neon, + vpx_convolve12_vert_neon, + vpx_convolve12_neon, 0); +const Convolve12TapParam kArrayConvolve12Tap_neon[] = { ALL_SIZES_12TAP( + convolve12Tap_neon) }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_neon)); +#endif // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER #endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +const ConvolveFunctions convolve8_neon_dotprod( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod, + vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod, + vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod, + vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES( + convolve8_neon_dotprod) }; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_dotprod)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +const ConvolveFunctions12Tap convolve12Tap_neon_dotprod( + vpx_convolve12_horiz_neon_dotprod, vpx_convolve12_vert_neon_dotprod, + vpx_convolve12_neon_dotprod, 0); +const Convolve12TapParam kArrayConvolve12Tap_neon_dotprod[] = { ALL_SIZES_12TAP( + convolve12Tap_neon_dotprod) }; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_neon_dotprod)); +#endif +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_sve_8, + wrap_convolve8_avg_horiz_sve_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, + wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8, + wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, + wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, + wrap_convolve8_horiz_sve_10, wrap_convolve8_avg_horiz_sve_10, + wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, + wrap_convolve8_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_sve( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, + wrap_convolve8_horiz_sve_12, wrap_convolve8_avg_horiz_sve_12, + wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, + wrap_convolve8_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve[] = { ALL_SIZES(convolve8_sve), + ALL_SIZES(convolve10_sve), + ALL_SIZES(convolve12_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve)); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + +#if HAVE_SVE2 +#if CONFIG_VP9_HIGHBITDEPTH +const ConvolveFunctions convolve8_sve2( + wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_sve2_8, + wrap_convolve8_avg_vert_sve2_8, wrap_convolve8_sve2_8, + wrap_convolve8_avg_sve2_8, wrap_convolve8_horiz_c_8, + wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8, + wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8); +const ConvolveFunctions convolve10_sve2( + wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_sve2_10, + wrap_convolve8_avg_vert_sve2_10, wrap_convolve8_sve2_10, + wrap_convolve8_avg_sve2_10, wrap_convolve8_horiz_c_10, + wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10, + wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10, + 10); +const ConvolveFunctions convolve12_sve2( + wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_sve2_12, + wrap_convolve8_avg_vert_sve2_12, wrap_convolve8_sve2_12, + wrap_convolve8_avg_sve2_12, wrap_convolve8_horiz_c_12, + wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, + wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, + 12); + +const ConvolveParam kArrayConvolve_sve2[] = { ALL_SIZES(convolve8_sve2), + ALL_SIZES(convolve10_sve2), + ALL_SIZES(convolve12_sve2) }; +INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_sve2)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +const ConvolveFunctions12Tap convolve12tap_8bit_sve2( + wrap_convolve12_horiz_sve2_8, wrap_convolve12_vert_sve2_8, + wrap_convolve12_sve2_8, 8); + +const ConvolveFunctions12Tap convolve12tap_10bit_sve2( + wrap_convolve12_horiz_sve2_10, wrap_convolve12_vert_sve2_10, + wrap_convolve12_sve2_10, 10); + +const ConvolveFunctions12Tap convolve12tap_12bit_sve2( + wrap_convolve12_horiz_sve2_12, wrap_convolve12_vert_sve2_12, + wrap_convolve12_sve2_12, 12); + +const Convolve12TapParam kArrayConvolve12Tap_sve2[] = { + ALL_SIZES_12TAP(convolve12tap_8bit_sve2), + ALL_SIZES_12TAP(convolve12tap_10bit_sve2), + ALL_SIZES_12TAP(convolve12tap_12bit_sve2) +}; + +INSTANTIATE_TEST_SUITE_P(SVE2, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_sve2)); +#endif // !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE2 + +#if HAVE_NEON_I8MM +const ConvolveFunctions convolve8_neon_i8mm( + vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm, + vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm, + vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm, + vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, + vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, + vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES( + convolve8_neon_i8mm) }; +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_neon_i8mm)); + +#if !CONFIG_REALTIME_ONLY && CONFIG_VP9_ENCODER +const ConvolveFunctions12Tap convolve12Tap_neon_i8mm( + vpx_convolve12_horiz_neon_i8mm, vpx_convolve12_vert_neon_i8mm, + vpx_convolve12_neon_i8mm, 0); +const Convolve12TapParam kArrayConvolve12Tap_neon_i8mm[] = { ALL_SIZES_12TAP( + convolve12Tap_neon_i8mm) }; +INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest12Tap, + ::testing::ValuesIn(kArrayConvolve12Tap_neon_i8mm)); +#endif +#endif // HAVE_NEON_I8MM + #if HAVE_DSPR2 const ConvolveFunctions convolve8_dspr2( vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2, @@ -1122,8 +2252,8 @@ const ConvolveFunctions convolve8_dspr2( vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) }; -INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_dspr2)); +INSTANTIATE_TEST_SUITE_P(DSPR2, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_dspr2)); #endif // HAVE_DSPR2 #if HAVE_MSA @@ -1132,10 +2262,47 @@ const ConvolveFunctions convolve8_msa( vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; -INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, - ::testing::ValuesIn(kArrayConvolve8_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_msa)); #endif // HAVE_MSA + +#if HAVE_LSX +const ConvolveFunctions convolve8_lsx( + vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx, + vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx, + vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + +const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) }; +INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve8_lsx)); +#endif // HAVE_LSX + +#if HAVE_VSX +const ConvolveFunctions convolve8_vsx( + vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx, + vpx_convolve8_avg_horiz_vsx, vpx_convolve8_vert_vsx, + vpx_convolve8_avg_vert_vsx, vpx_convolve8_vsx, vpx_convolve8_avg_vsx, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; +INSTANTIATE_TEST_SUITE_P(VSX, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_vsx)); +#endif // HAVE_VSX + +#if HAVE_MMI +const ConvolveFunctions convolve8_mmi( + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, + vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) }; +INSTANTIATE_TEST_SUITE_P(MMI, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_mmi)); +#endif // HAVE_MMI } // namespace diff --git a/media/libvpx/libvpx/test/cpu_speed_test.cc b/media/libvpx/libvpx/test/cpu_speed_test.cc index 404b5b44f4..6e0a046633 100644 --- a/media/libvpx/libvpx/test/cpu_speed_test.cc +++ b/media/libvpx/libvpx/test/cpu_speed_test.cc @@ -7,7 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -26,9 +26,9 @@ class CpuSpeedTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR), tune_content_(VP9E_CONTENT_DEFAULT) {} - virtual ~CpuSpeedTest() {} + ~CpuSpeedTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -40,11 +40,11 @@ class CpuSpeedTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; } + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -56,7 +56,7 @@ class CpuSpeedTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0]; } @@ -105,7 +105,7 @@ TEST_P(CpuSpeedTest, TestTuneScreen) { ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25); cfg_.g_timebase = video.timebase(); cfg_.rc_2pass_vbr_minsection_pct = 5; - cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; cfg_.rc_target_bitrate = 2000; cfg_.rc_max_quantizer = 63; cfg_.rc_min_quantizer = 0; @@ -148,9 +148,6 @@ TEST_P(CpuSpeedTest, TestLowBitrate) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(CpuSpeedTest, ONE_PASS_TEST_MODES, + ::testing::Range(0, 10)); } // namespace diff --git a/media/libvpx/libvpx/test/cq_test.cc b/media/libvpx/libvpx/test/cq_test.cc index 20e1f0f3de..103d1b8dca 100644 --- a/media/libvpx/libvpx/test/cq_test.cc +++ b/media/libvpx/libvpx/test/cq_test.cc @@ -9,11 +9,12 @@ */ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "vpx_config.h" namespace { @@ -27,11 +28,11 @@ class CQTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { public: // maps the cqlevel to the bitrate produced. - typedef std::map BitrateMap; + using BitrateMap = std::map; - static void SetUpTestCase() { bitrates_.clear(); } + static void SetUpTestSuite() { bitrates_.clear(); } - static void TearDownTestCase() { + static void TearDownTestSuite() { ASSERT_TRUE(!HasFailure()) << "skipping bitrate validation due to earlier failure."; uint32_t prev_actual_bitrate = kCQTargetBitrate; @@ -50,22 +51,22 @@ class CQTest : public ::libvpx_test::EncoderTest, init_flags_ = VPX_CODEC_USE_PSNR; } - virtual ~CQTest() {} + ~CQTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { file_size_ = 0; psnr_ = 0.0; n_frames_ = 0; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { if (cfg_.rc_end_usage == VPX_CQ) { encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); } @@ -73,12 +74,12 @@ class CQTest : public ::libvpx_test::EncoderTest, } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0); n_frames_++; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { file_size_ += pkt->data.frame.sz; } @@ -104,6 +105,10 @@ CQTest::BitrateMap CQTest::bitrates_; TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { const vpx_rational timebase = { 33333333, 1000000000 }; +#if CONFIG_REALTIME_ONlY + GTEST_SKIP() + << "Non-zero g_lag_in_frames is unsupported with CONFIG_REALTIME_ONLY"; +#else cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = kCQTargetBitrate; cfg_.g_lag_in_frames = 25; @@ -124,8 +129,9 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); const double vbr_psnr_lin = GetLinearPSNROverBitrate(); EXPECT_GE(cq_psnr_lin, vbr_psnr_lin); +#endif // CONFIG_REALTIME_ONLY } -VP8_INSTANTIATE_TEST_CASE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax, - kCQLevelStep)); +VP8_INSTANTIATE_TEST_SUITE(CQTest, ::testing::Range(kCQLevelMin, kCQLevelMax, + kCQLevelStep)); } // namespace diff --git a/media/libvpx/libvpx/test/cx_set_ref.sh b/media/libvpx/libvpx/test/cx_set_ref.sh index 0a58dc187b..0a3d50ce1f 100644 --- a/media/libvpx/libvpx/test/cx_set_ref.sh +++ b/media/libvpx/libvpx/test/cx_set_ref.sh @@ -38,7 +38,7 @@ vpx_set_ref() { eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${ref_frame_num}" ${devnull} + "${ref_frame_num}" ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } diff --git a/media/libvpx/libvpx/test/datarate_test.cc b/media/libvpx/libvpx/test/datarate_test.cc deleted file mode 100644 index 98d77285ab..0000000000 --- a/media/libvpx/libvpx/test/datarate_test.cc +++ /dev/null @@ -1,1476 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vpx_config.h" -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/codec_factory.h" -#include "test/encode_test_driver.h" -#include "test/i420_video_source.h" -#include "test/util.h" -#include "test/y4m_video_source.h" -#include "vpx/vpx_codec.h" - -namespace { - -class DatarateTestLarge - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - - virtual ~DatarateTestLarge() {} - - protected: - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; - duration_ = 0.0; - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - gf_boost_ = 0; - } - - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); - } - - if (denoiser_offon_test_) { - ASSERT_GT(denoiser_offon_period_, 0) - << "denoiser_offon_period_ is not positive."; - if ((video->frame() + 1) % denoiser_offon_period_ == 0) { - // Flip denoiser_on_ periodically - denoiser_on_ ^= 1; - } - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - } - - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - // Time since last timestamp = duration. - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - - // TODO(jimbankoski): Remove these lines when the issue: - // http://code.google.com/p/webm/issues/detail?id=496 is fixed. - // For now the codec assumes buffer starts at starting buffer rate - // plus one frame's time. - if (last_pts_ == 0) duration = 1; - - // Add to the buffer the bits we'd expect from a constant bitrate server. - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - - /* Test the buffer model here before subtracting the frame. Do so because - * the way the leaky bucket model works in libvpx is to allow the buffer to - * empty - and then stop showing frames until we've got enough bits to - * show one. As noted in comment below (issue 495), this does not currently - * apply to key frames. For now exclude key frames in condition below. */ - const bool key_frame = - (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; - } - - const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; - - // Subtract from the buffer the bits associated with a played back frame. - bits_in_buffer_model_ -= frame_size_in_bits; - - // Update the running total of bits for end of test datarate checks. - bits_total_ += frame_size_in_bits; - - // If first drop not set and we have a drop set it to this time. - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - - // We update this so that we can calculate the datarate minus the last - // frame encoded in the file. - bits_in_last_frame_ = frame_size_in_bits; - - ++frame_number_; - } - - virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - - duration_ = (last_pts_ + 1) * timebase_; - - // Effective file datarate includes the time spent prebuffering. - effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / - (cfg_.rc_buf_initial_sz / 1000.0 + duration_); - - file_datarate_ = file_size_in_kb / duration_; - } - } - - vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; - double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; - double duration_; - double file_datarate_; - double effective_datarate_; - int64_t bits_in_last_frame_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; - int set_cpu_used_; - int gf_boost_; -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestLarge, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. - denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestLarge, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestLarge, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene. Ignoring datarate - // constraints - // TODO(jimbankoski): ( Fix when issue - // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted about (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Here we check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. - - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif -#ifndef BUILDING_WITH_TSAN -TEST_P(DatarateTestLarge, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // !BUILDING_WITH_TSAN - -class DatarateTestRealTime : public DatarateTestLarge { - public: - virtual ~DatarateTestRealTime() {} -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestRealTime, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. - denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestRealTime, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestRealTime, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene, ignoring datarate - // constraints - // TODO(jimbankoski): Fix when issue - // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed. - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted above (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i <= 700; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. - - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 - -#ifndef BUILDING_WITH_TSAN -TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // Encode using multiple threads. - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif - -TEST_P(DatarateTestRealTime, GFBoost) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 0; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_error_resilient = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Apply a gf boost. - gf_boost_ = 50; - - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -class DatarateTestVP9Large - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {} - - protected: - virtual ~DatarateTestVP9Large() {} - - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - tot_frame_number_ = 0; - first_drop_ = 0; - num_drops_ = 0; - // Denoiser is off by default. - denoiser_on_ = 0; - // For testing up to 3 layers. - for (int i = 0; i < 3; ++i) { - bits_total_[i] = 0; - } - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - } - - // - // Frame flags and layer id for temporal layers. - // - - // For two layers, test pattern is: - // 1 3 - // 0 2 ..... - // For three layers, test pattern is: - // 1 3 5 7 - // 2 6 - // 0 4 .... - // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. - // For this 3 layer example, the 2nd enhancement layer (layer 2) does not - // update any reference frames. - int SetFrameFlags(int frame_num, int num_temp_layers) { - int frame_flags = 0; - if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - // Layer 0: predict from L and ARF, update L. - frame_flags = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; - } else { - // Layer 1: predict from L, G and ARF, and update G. - frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | - VP8_EFLAG_NO_UPD_ENTROPY; - } - } else if (num_temp_layers == 3) { - if (frame_num % 4 == 0) { - // Layer 0: predict from L and ARF; update L. - frame_flags = - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; - } else if ((frame_num - 2) % 4 == 0) { - // Layer 1: predict from L, G, ARF; update G. - frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; - } else if ((frame_num - 1) % 2 == 0) { - // Layer 2: predict from L, G, ARF; update none. - frame_flags = - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; - } - } - return frame_flags; - } - - int SetLayerId(int frame_num, int num_temp_layers) { - int layer_id = 0; - if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - layer_id = 0; - } else { - layer_id = 1; - } - } else if (num_temp_layers == 3) { - if (frame_num % 4 == 0) { - layer_id = 0; - } else if ((frame_num - 2) % 4 == 0) { - layer_id = 1; - } else if ((frame_num - 1) % 2 == 0) { - layer_id = 2; - } - } - return layer_id; - } - - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - - if (denoiser_offon_test_) { - ASSERT_GT(denoiser_offon_period_, 0) - << "denoiser_offon_period_ is not positive."; - if ((video->frame() + 1) % denoiser_offon_period_ == 0) { - // Flip denoiser_on_ periodically - denoiser_on_ ^= 1; - } - } - - encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); - - if (cfg_.ts_number_layers > 1) { - if (video->frame() == 0) { - encoder->Control(VP9E_SET_SVC, 1); - } - vpx_svc_layer_id_t layer_id; - layer_id.spatial_layer_id = 0; - frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers); - layer_id.temporal_layer_id = - SetLayerId(video->frame(), cfg_.ts_number_layers); - encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); - } - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - // Time since last timestamp = duration. - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - - if (duration > 1) { - // If first drop not set and we have a drop set it to this time. - if (!first_drop_) first_drop_ = last_pts_ + 1; - // Update the number of frame drops. - num_drops_ += static_cast(duration - 1); - // Update counter for total number of frames (#frames input to encoder). - // Needed for setting the proper layer_id below. - tot_frame_number_ += static_cast(duration - 1); - } - - int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); - - // Add to the buffer the bits we'd expect from a constant bitrate server. - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - - // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; - - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - - // Update the total encoded bits. For temporal layers, update the cumulative - // encoded bits per layer. - for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { - bits_total_[i] += frame_size_in_bits; - } - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - ++frame_number_; - ++tot_frame_number_; - } - - virtual void EndPassHook(void) { - for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); - ++layer) { - duration_ = (last_pts_ + 1) * timebase_; - if (bits_total_[layer]) { - // Effective file datarate: - effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; - } - } - } - - vpx_codec_pts_t last_pts_; - double timebase_; - int frame_number_; // Counter for number of non-dropped/encoded frames. - int tot_frame_number_; // Counter for total number of input frames. - int64_t bits_total_[3]; - double duration_; - double effective_datarate_[3]; - int set_cpu_used_; - int64_t bits_in_buffer_model_; - vpx_codec_pts_t first_drop_; - int num_drops_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; -}; - -// Check basic rate targeting for VBR mode with 0 lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for VBR mode with non-zero lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - // For non-zero lag, rate control will work (be within bounds) for - // real-time mode. - if (deadline_ == VPX_DL_REALTIME) { - cfg_.g_lag_in_frames = 15; - } else { - cfg_.g_lag_in_frames = 0; - } - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for CBR mode. -TEST_P(DatarateTestVP9Large, BasicRateTargeting) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int i = 150; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. -TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - // Encode using multiple threads. - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic rate targeting for CBR. -TEST_P(DatarateTestVP9Large, BasicRateTargeting444) { - ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); - - cfg_.g_profile = 1; - cfg_.g_timebase = video.timebase(); - - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - - for (int i = 250; i < 900; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(static_cast(cfg_.rc_target_bitrate), - effective_datarate_[0] * 1.15) - << " The datarate for the file missed the target!" - << cfg_.rc_target_bitrate << " " << effective_datarate_; - } -} - -// Check that (1) the first dropped frame gets earlier and earlier -// as the drop frame threshold is increased, and (2) that the total number of -// frame drops does not decrease as we increase frame drop threshold. -// Use a lower qp-max to force some frame drops. -TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_undershoot_pct = 20; - cfg_.rc_undershoot_pct = 20; - cfg_.rc_dropframe_thresh = 10; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 50; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.g_lag_in_frames = 0; - // TODO(marpan): Investigate datarate target failures with a smaller keyframe - // interval (128). - cfg_.kf_max_dist = 9999; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = 140; - int last_num_drops = 0; - for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - ASSERT_GE(num_drops_, last_num_drops * 0.85) - << " The number of dropped frames for drop_thresh " << i - << " < number of dropped frames for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - last_num_drops = num_drops_; - } -} - -// Check basic rate targeting for 2 temporal layers. -TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1). - cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 2; - cfg_.ts_rate_decimator[0] = 2; - cfg_.ts_rate_decimator[1] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 60-40 bitrate allocation for 2 temporal layers. - cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). - cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than .75. - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than 1.25. - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers, with frame dropping. -// Only for one (low) bitrate with lower max_quantizer, and somewhat higher -// frame drop threshold, to force frame dropping. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - // Set frame drop threshold and rc_max_quantizer to force some frame drops. - cfg_.rc_dropframe_thresh = 20; - cfg_.rc_max_quantizer = 45; - cfg_.rc_min_quantizer = 0; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). - cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - cfg_.rc_target_bitrate = 200; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - // Expect some frame drops in this test: for this 200 frames test, - // expect at least 10% and not more than 60% drops. - ASSERT_GE(num_drops_, 20); - ASSERT_LE(num_drops_, 130); - } -} - -#if CONFIG_VP9_TEMPORAL_DENOISING -class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { - public: - virtual ~DatarateTestVP9LargeDenoiser() {} -}; - -// Check basic datarate targeting, for a single bitrate, when denoiser is on. -TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Turn on the denoiser. - denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is on, -// for clip with high noise level. -TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 1000; - ResetModel(); - // Turn on the denoiser. - denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} -#endif // CONFIG_VP9_TEMPORAL_DENOISING - -class DatarateOnePassCbrSvc - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) { - memset(&svc_params_, 0, sizeof(svc_params_)); - } - virtual ~DatarateOnePassCbrSvc() {} - - protected: - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - speed_setting_ = GET_PARAM(2); - ResetModel(); - } - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; - duration_ = 0.0; - mismatch_psnr_ = 0.0; - mismatch_nframes_ = 0; - } - virtual void BeginPassHook(unsigned int /*pass*/) {} - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - int i; - for (i = 0; i < VPX_MAX_LAYERS; ++i) { - svc_params_.max_quantizers[i] = 63; - svc_params_.min_quantizers[i] = 0; - } - svc_params_.speed_per_layer[0] = 5; - for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = speed_setting_; - } - encoder->Control(VP9E_SET_SVC, 1); - encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); - encoder->Control(VP8E_SET_CPUUSED, speed_setting_); - encoder->Control(VP9E_SET_TILE_COLUMNS, 0); - encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); - encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); - encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); - } - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - if (last_pts_ == 0) duration = 1; - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - const bool key_frame = - (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - // TODO(marpan): This check currently fails for some of the SVC tests, - // re-enable when issue (webm:1350) is resolved. - // ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - // << pkt->data.frame.pts; - } - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - bits_in_buffer_model_ -= static_cast(frame_size_in_bits); - bits_total_ += frame_size_in_bits; - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - last_pts_ = pkt->data.frame.pts; - bits_in_last_frame_ = frame_size_in_bits; - ++frame_number_; - } - virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_ = file_size_in_kb / duration_; - } - } - - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { - double mismatch_psnr = compute_psnr(img1, img2); - mismatch_psnr_ += mismatch_psnr; - ++mismatch_nframes_; - } - - unsigned int GetMismatchFrames() { return mismatch_nframes_; } - - vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; - double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; - double duration_; - double file_datarate_; - size_t bits_in_last_frame_; - vpx_svc_extra_cfg_t svc_params_; - int speed_setting_; - double mismatch_psnr_; - int mismatch_nframes_; -}; -static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, - const vpx_svc_extra_cfg_t *svc_params, - int spatial_layers, int temporal_layers, - int temporal_layering_mode) { - int sl, spatial_layer_target; - float total = 0; - float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; - for (sl = 0; sl < spatial_layers; ++sl) { - if (svc_params->scaling_factor_den[sl] > 0) { - alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / - svc_params->scaling_factor_den[sl]); - total += alloc_ratio[sl]; - } - } - for (sl = 0; sl < spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl] = spatial_layer_target = - (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / total); - const int index = sl * temporal_layers; - if (temporal_layering_mode == 3) { - enc_cfg->layer_target_bitrate[index] = spatial_layer_target >> 1; - enc_cfg->layer_target_bitrate[index + 1] = - (spatial_layer_target >> 1) + (spatial_layer_target >> 2); - enc_cfg->layer_target_bitrate[index + 2] = spatial_layer_target; - } else if (temporal_layering_mode == 2) { - enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; - enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; - } - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - // TODO(marpan): Check that effective_datarate for each layer hits the - // layer target_bitrate. - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 -// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - cfg_.rc_target_bitrate = 400; - // For this 3 temporal layer case, pattern repeats every 4 frames, so choose - // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). - for (int j = 64; j <= 67; j++) { - cfg_.kf_max_dist = j; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 4; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 -// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - cfg_.rc_target_bitrate = 800; - // For this 3 temporal layer case, pattern repeats every 4 frames, so choose - // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). - for (int j = 32; j <= 35; j++) { - cfg_.kf_max_dist = j; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and -// 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 4; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial -// downscale 5x5. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers5x5MultipleRuns) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 1; - cfg_.ts_rate_decimator[0] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 3; - cfg_.temporal_layering_mode = 0; - svc_params_.scaling_factor_num[0] = 256; - svc_params_.scaling_factor_den[0] = 1280; - svc_params_.scaling_factor_num[1] = 1280; - svc_params_.scaling_factor_den[1] = 1280; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 999999; - cfg_.kf_min_dist = 0; - cfg_.ss_target_bitrate[0] = 300; - cfg_.ss_target_bitrate[1] = 1400; - cfg_.layer_target_bitrate[0] = 300; - cfg_.layer_target_bitrate[1] = 1400; - cfg_.rc_target_bitrate = 1700; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, - ::testing::Values(0)); -VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Values(-6, -12)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, - ::testing::Values(::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(2, 9)); -#if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -#endif -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -} // namespace diff --git a/media/libvpx/libvpx/test/dct16x16_test.cc b/media/libvpx/libvpx/test/dct16x16_test.cc index f9745ed81f..a3382ab64e 100644 --- a/media/libvpx/libvpx/test/dct16x16_test.cc +++ b/media/libvpx/libvpx/test/dct16x16_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -24,8 +25,9 @@ #include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_config.h" #include "vpx_ports/mem.h" -#include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -222,17 +224,16 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) { } } -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride); +using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride); +using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple Dct16x16Param; -typedef std::tr1::tuple Ht16x16Param; -typedef std::tr1::tuple - Idct16x16Param; +using Dct16x16Param = std::tuple; +using Ht16x16Param = std::tuple; +using Idct16x16Param = std::tuple; void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, int /*tx_type*/) { @@ -255,11 +256,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride, #if CONFIG_VP9_HIGHBITDEPTH void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride, @@ -273,43 +274,43 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride, } void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_c(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12); + vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH class Trans16x16TestBase { public: - virtual ~Trans16x16TestBase() {} + virtual ~Trans16x16TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -353,7 +354,7 @@ class Trans16x16TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -475,10 +476,10 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_, + inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_, tx_type_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } if (bit_depth_ == VPX_BITS_8) { @@ -530,8 +531,7 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -543,18 +543,56 @@ class Trans16x16TestBase { const uint32_t diff = dst[j] - src[j]; #endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; - EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error - << " at index " << j; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; } } } + void RunSpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int c_sum_time = 0; + int simd_sum_time = 0; + + DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]); + + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + } + + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + vpx_fdct16x16_c(input_block, output_ref_block, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_c)); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunFwdTxfm(input_block, output_block, pitch_); + } + + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += static_cast(vpx_usec_timer_elapsed(&timer_mod)); + + printf( + "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time, + simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + void CompareInvReference(IdctFunc ref_txfm, int thresh) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 10000; const int eob = 10; const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); #if CONFIG_VP9_HIGHBITDEPTH @@ -585,9 +623,9 @@ class Trans16x16TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); } else { #if CONFIG_VP9_HIGHBITDEPTH - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -605,6 +643,80 @@ class Trans16x16TestBase { } } + void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 10; + const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + int pitch_; int tx_type_; vpx_bit_depth_t bit_depth_; @@ -616,9 +728,9 @@ class Trans16x16TestBase { class Trans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~Trans16x16DCT() {} + ~Trans16x16DCT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -637,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase, inv_txfm_ref = idct16x16_ref; #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -665,12 +777,14 @@ TEST_P(Trans16x16DCT, QuantCheck) { TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } +TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); } + class Trans16x16HT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~Trans16x16HT() {} + ~Trans16x16HT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -689,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase, inv_txfm_ref = iht16x16_ref; #endif } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -718,9 +832,9 @@ TEST_P(Trans16x16HT, QuantCheck) { class InvTrans16x16DCT : public Trans16x16TestBase, public ::testing::TestWithParam { public: - virtual ~InvTrans16x16DCT() {} + ~InvTrans16x16DCT() override = default; - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -728,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase, pitch_ = 16; mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {} - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, + int /*stride*/) override {} + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -740,89 +855,34 @@ class InvTrans16x16DCT : public Trans16x16TestBase, IdctFunc inv_txfm_; int thresh_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT); TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -class PartialTrans16x16Test : public ::testing::TestWithParam< - std::tr1::tuple > { - public: - virtual ~PartialTrans16x16Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FdctFunc fwd_txfm_; -}; - -TEST_P(PartialTrans16x16Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]); +TEST_P(InvTrans16x16DCT, DISABLED_Speed) { + RunInvTrans16x16SpeedTest(ref_txfm_, thresh_); } -TEST_P(PartialTrans16x16Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ(sum >> 1, output[0]); -} - -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16DCT, ::testing::Values( make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10), make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12), make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, - &vpx_idct16x16_256_add_c, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(C, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16HT, ::testing::Values( make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10), @@ -837,37 +897,45 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_c, - VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT, + ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_c, + 6225, VPX_BITS_8))); + #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, - 0, VPX_BITS_8))); -#endif + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + NEON, Trans16x16DCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16HT, ::testing::Values(make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0, VPX_BITS_8), @@ -877,13 +945,27 @@ INSTANTIATE_TEST_CASE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + AVX2, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_avx2, + &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT, + ::testing::Values(make_tuple( + &vpx_idct16x16_256_add_c, + &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16DCT, ::testing::Values( make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 0, VPX_BITS_10), @@ -894,7 +976,7 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_12), make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_c, 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), @@ -904,7 +986,7 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8))); // Optimizations take effect at a threshold of 3155, so we use a value close to // that to test both branches. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, InvTrans16x16DCT, ::testing::Values(make_tuple(&idct16x16_10_add_10_c, &idct16x16_10_add_10_sse2, 3167, VPX_BITS_10), @@ -914,17 +996,14 @@ INSTANTIATE_TEST_CASE_P( &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12), make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_msa, - &vpx_idct16x16_256_add_msa, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + MSA, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( MSA, Trans16x16HT, ::testing::Values( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8), @@ -932,8 +1011,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_lsx, + &vpx_idct16x16_256_add_c, + 0, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/media/libvpx/libvpx/test/dct32x32_test.cc b/media/libvpx/libvpx/test/dct32x32_test.cc index a168e690ee..77447481cb 100644 --- a/media/libvpx/libvpx/test/dct32x32_test.cc +++ b/media/libvpx/libvpx/test/dct32x32_test.cc @@ -11,21 +11,24 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#include "vpx_ports/msvc.h" // for round() +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -63,26 +66,30 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], } } -typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); +using FwdTxfmFunc = void (*)(const int16_t *in, tran_low_t *out, int stride); +using InvTxfmFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple - Trans32x32Param; +using Trans32x32Param = + std::tuple; + +using InvTrans32x32Param = + std::tuple; #if CONFIG_VP9_HIGHBITDEPTH void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12); + vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // CONFIG_VP9_HIGHBITDEPTH -class Trans32x32Test : public ::testing::TestWithParam { +class Trans32x32Test : public AbstractBench, + public ::testing::TestWithParam { public: - virtual ~Trans32x32Test() {} - virtual void SetUp() { + ~Trans32x32Test() override = default; + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); version_ = GET_PARAM(2); // 0: high precision forward transform @@ -91,7 +98,7 @@ class Trans32x32Test : public ::testing::TestWithParam { mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int version_; @@ -99,8 +106,14 @@ class Trans32x32Test : public ::testing::TestWithParam { int mask_; FwdTxfmFunc fwd_txfm_; InvTxfmFunc inv_txfm_; + + int16_t *bench_in_; + tran_low_t *bench_out_; + void Run() override; }; +void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); } + TEST_P(Trans32x32Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); uint32_t max_error = 0; @@ -137,7 +150,7 @@ TEST_P(Trans32x32Test, AccuracyCheck) { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32)); + inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32)); #endif } @@ -237,6 +250,19 @@ TEST_P(Trans32x32Test, MemCheck) { } } +TEST_P(Trans32x32Test, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + bench_in_ = input_extreme_block; + bench_out_ = output_block; + + RunNTimes(INT16_MAX); + PrintMedian("32x32"); +} + TEST_P(Trans32x32Test, InverseAccuracy) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; @@ -275,7 +301,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32)); + ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32)); #endif } for (int j = 0; j < kNumCoeffs; ++j) { @@ -292,71 +318,178 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -class PartialTrans32x32Test - : public ::testing::TestWithParam< - std::tr1::tuple > { +class InvTrans32x32Test : public ::testing::TestWithParam { public: - virtual ~PartialTrans32x32Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); + ~InvTrans32x32Test() override = default; + void SetUp() override { + ref_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop + bit_depth_ = GET_PARAM(3); + eob_ = GET_PARAM(4); + thresh_ = GET_PARAM(4); + mask_ = (1 << bit_depth_) - 1; + pitch_ = 32; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: + void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) { + ref_txfm_(out, dst, stride); + } + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride); + } + int version_; vpx_bit_depth_t bit_depth_; - FwdTxfmFunc fwd_txfm_; + int mask_; + int eob_; + int thresh_; + + InvTxfmFunc ref_txfm_; + InvTxfmFunc inv_txfm_; + int pitch_; + + void RunInvTrans32x32SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + int64_t c_sum_time = 0; + int64_t simd_sum_time = 0; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob_) { + // Random values less than the threshold, either positive or negative + coeff[scan[j]] = rnd(thresh_); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + if (bit_depth_ == VPX_BITS_8) { + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, ref, pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, dst, pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + vpx_usec_timer timer_c; + vpx_usec_timer_start(&timer_c); + for (int i = 0; i < count_test_block; ++i) { + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + } + vpx_usec_timer_mark(&timer_c); + c_sum_time += vpx_usec_timer_elapsed(&timer_c); + + vpx_usec_timer timer_mod; + vpx_usec_timer_start(&timer_mod); + for (int i = 0; i < count_test_block; ++i) { + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_); + } + vpx_usec_timer_mark(&timer_mod); + simd_sum_time += vpx_usec_timer_elapsed(&timer_mod); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + printf( + "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n", + c_sum_time, simd_sum_time, + (static_cast(c_sum_time) / static_cast(simd_sum_time))); + } + + void CompareInvReference32x32() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 10000; + const int eob = 31; + const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan; + DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); + DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); + DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + for (int i = 0; i < count_test_block; ++i) { + for (int j = 0; j < kNumCoeffs; ++j) { + if (j < eob) { + coeff[scan[j]] = rnd.Rand8Extremes(); + } else { + coeff[scan[j]] = 0; + } + if (bit_depth_ == VPX_BITS_8) { + dst[j] = 0; + ref[j] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + } else { + dst16[j] = 0; + ref16[j] = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + if (bit_depth_ == VPX_BITS_8) { + RunRefTxfm(coeff, ref, pitch_); + RunInvTxfm(coeff, dst, pitch_); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int j = 0; j < kNumCoeffs; ++j) { +#if CONFIG_VP9_HIGHBITDEPTH + const uint32_t diff = + bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j]; +#else + const uint32_t diff = dst[j] - ref[j]; +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error " + << error << " at index " << j; + } + } + } }; -TEST_P(PartialTrans32x32Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test); - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]); +TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); } +TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); } - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]); -} - -TEST_P(PartialTrans32x32Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ(sum >> 3, output[0]); -} - -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans32x32Test, ::testing::Values( make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 0, VPX_BITS_10), @@ -366,46 +499,51 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, - VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + C, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0, + VPX_BITS_8, 16, 6255))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( NEON, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Trans32x32Test, ::testing::Values( make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 0, VPX_BITS_10), @@ -418,29 +556,49 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_avx2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, InvTrans32x32Test, + ::testing::Values( + (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0, + VPX_BITS_8, 32, 6225)), + make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0, + VPX_BITS_8, 16, 6225))); #endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_msa, &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_vsx, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P( + LSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_lsx, + &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_lsx, + &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/media/libvpx/libvpx/test/dct_partial_test.cc b/media/libvpx/libvpx/test/dct_partial_test.cc new file mode 100644 index 0000000000..dc4921ab94 --- /dev/null +++ b/media/libvpx/libvpx/test/dct_partial_test.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::make_tuple; +using std::tuple; + +namespace { +using PartialFdctFunc = void (*)(const int16_t *in, tran_low_t *out, + int stride); + +using PartialFdctParam = tuple; + +tran_low_t partial_fdct_ref(const Buffer &in, int size) { + int64_t sum = 0; + if (in.TopLeftPixel() != nullptr) { + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + } else { + assert(0); + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + if (output_block.TopLeftPixel() != nullptr) { + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } else { + assert(0); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_SUITE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/media/libvpx/libvpx/test/dct_test.cc b/media/libvpx/libvpx/test/dct_test.cc new file mode 100644 index 0000000000..ffc6b93ed9 --- /dev/null +++ b/media/libvpx/libvpx/test/dct_test.cc @@ -0,0 +1,791 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::make_tuple; +using std::tuple; + +namespace { +using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride); +using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride); +using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +using FhtFuncRef = void (*)(const Buffer &in, Buffer *out, + int size, int tx_type); +using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); +using IhtWithBdFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd); + +template +void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + (void)tx_type; + fn(in, out, stride); +} + +template +void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)tx_type; + (void)bd; + fn(in, out, stride); +} + +template +void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)bd; + fn(in, out, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +using HighbdIdctFunc = void (*)(const tran_low_t *in, uint16_t *out, int stride, + int bd); + +using HighbdIhtFunc = void (*)(const tran_low_t *in, uint16_t *out, int stride, + int tx_type, int bd); + +template +void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + (void)tx_type; + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} + +template +void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +struct FuncInfo { + FhtFunc ft_func; + IhtWithBdFunc it_func; + int size; + int pixel_size; +}; + +/* forward transform, inverse transform, size, transform type, bit depth */ +using DctParam = tuple; + +void fdct_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer &in, Buffer *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer &in, Buffer *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +class TransTestBase : public ::testing::TestWithParam { + public: + void SetUp() override { + rnd_.Reset(ACMRandom::DeterministicSeed()); + const int idx = GET_PARAM(0); + const FuncInfo *func_info = &(GET_PARAM(1)[idx]); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + fwd_txfm_ = func_info->ft_func; + inv_txfm_ = func_info->it_func; + size_ = func_info->size; + pixel_size_ = func_info->pixel_size; + max_pixel_value_ = (1 << bit_depth_) - 1; + + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. + if (stride_ > 16) { + stride_ &= ~15; + } + + block_size_ = size_ * stride_; + + src_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_NE(src_, nullptr); + dst_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_NE(dst_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + src_ = nullptr; + vpx_free(dst_); + dst_ = nullptr; + libvpx_test::ClearSystemState(); + } + + void InitMem() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + if (pixel_size_ == 1) { + for (int j = 0; j < block_size_; ++j) { + src_[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst_[j] = rnd_.Rand16() & max_pixel_value_; + } + } else { + ASSERT_EQ(pixel_size_, 2); + uint16_t *const src = reinterpret_cast(src_); + uint16_t *const dst = reinterpret_cast(dst_); + for (int j = 0; j < block_size_; ++j) { + src[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst[j] = rnd_.Rand16() & max_pixel_value_; + } + } + } + + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_); + } + + protected: + void RunAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer test_input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + ASSERT_NE(test_input_block.TopLeftPixel(), nullptr); + Buffer test_temp_block = + Buffer(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + InitMem(); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; + } + } + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_)); + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; + } + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast(limit), max_error) + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has an individual round trip error > " + << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has average round trip error > " + << limit << " per block"; + } + + void RunCoeffCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer input_extreme_block = + Buffer(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer output_ref_block = Buffer(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer output_block = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + ASSERT_NE(input_extreme_block.TopLeftPixel(), nullptr); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + ASSERT_NE(output_block.TopLeftPixel(), nullptr); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: " << size_ << "x" << size_ + << " transform has coefficient larger than 4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer in = Buffer(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer coeff = Buffer(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + + for (int i = 0; i < count_test_block; ++i) { + InitMem(); + ASSERT_NE(in.TopLeftPixel(), nullptr); + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { + in.TopLeftPixel()[h * in.stride() + w] = + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + in.TopLeftPixel()[h * in.stride() + w] = + src[h * stride_ + w] - dst[h * stride_ + w]; + } + } + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_)); + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; + } + const uint32_t error = diff * diff; + EXPECT_GE(static_cast(limit), error) + << "Error: " << size_ << "x" << size_ + << " inverse transform has error " << error << " at " << w << "," + << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + return; + } + } + } + } + } + + FhtFunc fwd_txfm_; + FhtFuncRef fwd_txfm_ref; + IhtWithBdFunc inv_txfm_; + ACMRandom rnd_; + uint8_t *src_; + uint8_t *dst_; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; + int stride_; + int pixel_size_; + int block_size_; +}; + +/* -------------------------------------------------------------------------- */ + +class TransDCT : public TransTestBase { + public: + TransDCT() { fwd_txfm_ref = fdct_ref; } +}; + +TEST_P(TransDCT, AccuracyCheck) { + int t = 1; + if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 2; + } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 7; + } + RunAccuracyCheck(t); +} + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +static const FuncInfo dct_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, 16, + 1 }, + { &fdct_wrapper, &idct_wrapper, 32, + 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_c_func_info) / + sizeof(dct_c_func_info[0]))), + ::testing::Values(dct_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 +static const FuncInfo dct_sse2_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE2, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_sse2_func_info) / + sizeof(dct_sse2_func_info[0]))), + ::testing::Values(dct_sse2_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +static const FuncInfo dct_ssse3_func_info = { + &fdct_wrapper, &idct_wrapper, 8, 1 +}; + +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_SUITE_P(SSSE3, TransDCT, + ::testing::Values(make_tuple(0, &dct_ssse3_func_info, + 0, VPX_BITS_8))); +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_avx2_func_info = { + &fdct_wrapper, &idct_wrapper, + 32, 1 +}; + +// TODO(johannkoenig): high bit depth fdct32x32. +INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT, + ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_neon_func_info[] = { + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + /* { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 },*/ +}; +#else +static const FuncInfo dct_neon_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_SUITE_P( + NEON, TransDCT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_neon_func_info) / + sizeof(dct_neon_func_info[0]))), + ::testing::Values(dct_neon_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON + +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_msa_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + MSA, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_msa_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransDCT, + ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_lsx_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + LSX, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_lsx_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransHT : public TransTestBase { + public: + TransHT() { fwd_txfm_ref = fht_ref; } +}; + +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 2 : 1); +} + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +static const FuncInfo ht_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper, + 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_c_func_info) / + sizeof(ht_c_func_info[0]))), + ::testing::Values(ht_c_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON + +static const FuncInfo ht_neon_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper, + 4, 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 }, + { &vp9_highbd_fht16x16_neon, + &highbd_iht_wrapper, 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht4x4_neon, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht8x8_neon, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 }, + { &vp9_fht16x16_neon, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + NEON, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_neon_func_info) / + sizeof(ht_neon_func_info[0]))), + ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON + +#if HAVE_SSE2 + +static const FuncInfo ht_sse2_func_info[3] = { + { &vp9_fht4x4_sse2, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_sse2, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_sse2, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE2, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse2_func_info), + ::testing::Range(0, 4), ::testing::Values(VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_sse4_1_func_info[3] = { + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, + 4, 2 }, + { vp9_highbd_fht8x8_c, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 } +}; + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse4_1_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, + VPX_BITS_12))); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_vsx_func_info[3] = { + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_VSX +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ + +class TransWHT : public TransTestBase { + public: + TransWHT() { fwd_txfm_ref = fwht_ref; } +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +static const FuncInfo wht_c_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 } +}; + +INSTANTIATE_TEST_SUITE_P( + C, TransWHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(wht_c_func_info) / + sizeof(wht_c_func_info[0]))), + ::testing::Values(wht_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +static const FuncInfo wht_sse2_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_SUITE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE + +} // namespace diff --git a/media/libvpx/libvpx/test/decode_api_test.cc b/media/libvpx/libvpx/test/decode_api_test.cc index 593637780e..d7436ca3c2 100644 --- a/media/libvpx/libvpx/test/decode_api_test.cc +++ b/media/libvpx/libvpx/test/decode_api_test.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/ivf_video_source.h" @@ -20,7 +20,7 @@ namespace { #define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) TEST(DecodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { + static vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_DECODER &vpx_codec_vp8_dx_algo, #endif @@ -31,27 +31,33 @@ TEST(DecodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; vpx_codec_ctx_t dec; - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(NULL, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(&dec, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, NULL, 0, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, buf, 0, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL)); - EXPECT_TRUE(vpx_codec_error(NULL) != NULL); + EXPECT_EQ(vpx_codec_dec_init(nullptr, nullptr, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_dec_init(&dec, nullptr, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, 0, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, buf, 0, nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, buf, NELEMENTS(buf), nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_decode(nullptr, nullptr, NELEMENTS(buf), nullptr, 0), + VPX_CODEC_INVALID_PARAM); + EXPECT_EQ(vpx_codec_destroy(nullptr), VPX_CODEC_INVALID_PARAM); + EXPECT_NE(vpx_codec_error(nullptr), nullptr); + EXPECT_EQ(vpx_codec_error_detail(nullptr), nullptr); for (int i = 0; i < NELEMENTS(kCodecs); ++i) { EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_dec_init(NULL, kCodecs[i], NULL, 0)); + vpx_codec_dec_init(nullptr, kCodecs[i], nullptr, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], NULL, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], nullptr, 0)); EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, - vpx_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0)); + vpx_codec_decode(&dec, buf, NELEMENTS(buf), nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(&dec, buf, 0, NULL, 0)); + vpx_codec_decode(&dec, nullptr, NELEMENTS(buf), nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, buf, 0, nullptr, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); } @@ -62,11 +68,12 @@ TEST(DecodeAPI, OptionalParams) { vpx_codec_ctx_t dec; #if CONFIG_ERROR_CONCEALMENT - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL, - VPX_CODEC_USE_ERROR_CONCEALMENT)); + EXPECT_EQ(VPX_CODEC_OK, + vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr, + VPX_CODEC_USE_ERROR_CONCEALMENT)); #else EXPECT_EQ(VPX_CODEC_INCAPABLE, - vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, NULL, + vpx_codec_dec_init(&dec, &vpx_codec_vp8_dx_algo, nullptr, VPX_CODEC_USE_ERROR_CONCEALMENT)); #endif // CONFIG_ERROR_CONCEALMENT } @@ -90,30 +97,30 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) { default: EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i]; break; } EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_control_(dec, kControls[i], NULL)); + vpx_codec_control_(dec, kControls[i], nullptr)); } vp9_ref_frame_t ref; ref.idx = 0; EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_control(dec, VP9_GET_REFERENCE, NULL)); + vpx_codec_control(dec, VP9_GET_REFERENCE, nullptr)); vpx_ref_frame_t ref_copy; const int width = 352; const int height = 288; - ASSERT_TRUE( - vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1) != NULL); + EXPECT_NE(vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1), + nullptr); ref_copy.frame_type = VP8_LAST_FRAME; EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_control(dec, VP8_COPY_REFERENCE, NULL)); + vpx_codec_control(dec, VP8_COPY_REFERENCE, nullptr)); vpx_img_free(&ref_copy.img); } TEST(DecodeAPI, Vp9InvalidDecode) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; const char filename[] = "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; libvpx_test::IVFVideoSource video(filename); @@ -122,24 +129,46 @@ TEST(DecodeAPI, Vp9InvalidDecode) { ASSERT_TRUE(!HasFailure()); vpx_codec_ctx_t dec; - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0)); const uint32_t frame_size = static_cast(video.frame_size()); #if CONFIG_VP9_HIGHBITDEPTH EXPECT_EQ(VPX_CODEC_MEM_ERROR, - vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0)); + vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0)); #else EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, - vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0)); + vpx_codec_decode(&dec, video.cxdata(), frame_size, nullptr, 0)); #endif - vpx_codec_iter_t iter = NULL; - EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); + vpx_codec_iter_t iter = nullptr; + EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter)); TestVp9Controls(&dec); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); } -TEST(DecodeAPI, Vp9PeekSI) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; +void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, + uint32_t peek_size) { + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get + // to decoder_peek_si_internal on frames of size < 8. + if (data_sz >= 8) { + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, nullptr, 0)); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM + : VPX_CODEC_CORRUPT_FRAME, + vpx_codec_decode(&dec, data, data_sz, nullptr, 0)); + vpx_codec_iter_t iter = nullptr; + EXPECT_EQ(nullptr, vpx_codec_get_frame(&dec, &iter)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } + + // Verify behavior of vpx_codec_peek_stream_info. + vpx_codec_stream_info_t si; + si.sz = sizeof(si); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK, + vpx_codec_peek_stream_info(codec, data, data_sz, &si)); +} + +TEST(DecodeAPI, Vp9PeekStreamInfo) { // The first 9 bytes are valid and the rest of the bytes are made up. Until // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it // should return VPX_CODEC_CORRUPT_FRAME. @@ -150,26 +179,37 @@ TEST(DecodeAPI, Vp9PeekSI) { }; for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) { - // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get - // to decoder_peek_si_internal on frames of size < 8. - if (data_sz >= 8) { - vpx_codec_ctx_t dec; - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); - EXPECT_EQ( - (data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME, - vpx_codec_decode(&dec, data, data_sz, NULL, 0)); - vpx_codec_iter_t iter = NULL; - EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); - } + TestPeekInfo(data, data_sz, 10); + } +} - // Verify behavior of vpx_codec_peek_stream_info. - vpx_codec_stream_info_t si; - si.sz = sizeof(si); - EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK, - vpx_codec_peek_stream_info(codec, data, data_sz, &si)); +TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) { + // This profile 1 header requires 10.25 bytes, ensure + // vpx_codec_peek_stream_info doesn't over read. + const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53, + 0xe9, 0x30, 0x68, 0x53, 0x04 }; + + for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) { + TestPeekInfo(profile1_data, data_sz, 11); } } #endif // CONFIG_VP9_DECODER +TEST(DecodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_DECODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_dx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_DECODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_dx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + } // namespace diff --git a/media/libvpx/libvpx/test/decode_corrupted.cc b/media/libvpx/libvpx/test/decode_corrupted.cc new file mode 100644 index 0000000000..55919d0f49 --- /dev/null +++ b/media/libvpx/libvpx/test/decode_corrupted.cc @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/i420_video_source.h" +#include "vpx_config.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +class DecodeCorruptedFrameTest + : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam< + std::tuple > { + public: + DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} + + protected: + ~DecodeCorruptedFrameTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + + // Set small key frame distance such that we insert more key frames. + cfg_.kf_max_dist = 3; + dec_cfg_.threads = 1; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} + + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { + // Don't edit frame packet on key frame. + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; + + modified_pkt_ = *pkt; + + // Halve the size so it's corrupted to decoder. + modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2; + + return &modified_pkt_; + } + + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { + EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); + return VPX_CODEC_MEM_ERROR != res_dec; + } + + vpx_codec_cx_pkt_t modified_pkt_; +}; + +TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) { + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif // CONFIG_VP9 + +#if CONFIG_VP8 +INSTANTIATE_TEST_SUITE_P( + VP8, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP8))); +#endif // CONFIG_VP8 + +} // namespace diff --git a/media/libvpx/libvpx/test/decode_perf_test.cc b/media/libvpx/libvpx/test/decode_perf_test.cc index ee26c3c046..69c30a9e80 100644 --- a/media/libvpx/libvpx/test/decode_perf_test.cc +++ b/media/libvpx/libvpx/test/decode_perf_test.cc @@ -9,6 +9,8 @@ */ #include +#include + #include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/encode_test_driver.h" @@ -17,11 +19,11 @@ #include "test/md5_helper.h" #include "test/util.h" #include "test/webm_video_source.h" +#include "vpx/vpx_codec.h" #include "vpx_ports/vpx_timer.h" #include "./ivfenc.h" -#include "./vpx_version.h" -using std::tr1::make_tuple; +using std::make_tuple; namespace { @@ -34,7 +36,7 @@ const char kNewEncodeOutputFile[] = "new_encode.ivf"; /* DecodePerfTest takes a tuple of filename + number of threads to decode with */ -typedef std::tr1::tuple DecodePerfParam; +using DecodePerfParam = std::tuple; const DecodePerfParam kVP9DecodePerfVectors[] = { make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1), @@ -85,7 +87,7 @@ TEST_P(DecodePerfTest, PerfTest) { vpx_usec_timer t; vpx_usec_timer_start(&t); - for (video.Begin(); video.cxdata() != NULL; video.Next()) { + for (video.Begin(); video.cxdata() != nullptr; video.Next()) { decoder.DecodeFrame(video.cxdata(), video.frame_size()); } @@ -96,7 +98,7 @@ TEST_P(DecodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"decode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", video_name); printf("\t\"threadCount\" : %u,\n", threads); printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); @@ -105,8 +107,8 @@ TEST_P(DecodePerfTest, PerfTest) { printf("}\n"); } -INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest, - ::testing::ValuesIn(kVP9DecodePerfVectors)); +INSTANTIATE_TEST_SUITE_P(VP9, DecodePerfTest, + ::testing::ValuesIn(kVP9DecodePerfVectors)); class VP9NewEncodeDecodePerfTest : public ::libvpx_test::EncoderTest, @@ -114,11 +116,11 @@ class VP9NewEncodeDecodePerfTest protected: VP9NewEncodeDecodePerfTest() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), - outfile_(0), out_frames_(0) {} + outfile_(nullptr), out_frames_(0) {} - virtual ~VP9NewEncodeDecodePerfTest() {} + ~VP9NewEncodeDecodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -135,33 +137,33 @@ class VP9NewEncodeDecodePerfTest cfg_.rc_end_usage = VPX_VBR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, speed_); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 2); } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; outfile_ = fopen(path_to_source.c_str(), "wb"); - ASSERT_TRUE(outfile_ != NULL); + ASSERT_NE(outfile_, nullptr); } - virtual void EndPassHook() { - if (outfile_ != NULL) { + void EndPassHook() override { + if (outfile_ != nullptr) { if (!fseek(outfile_, 0, SEEK_SET)) { ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); } fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -175,7 +177,7 @@ class VP9NewEncodeDecodePerfTest pkt->data.frame.sz); } - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } void set_speed(unsigned int speed) { speed_ = speed; } @@ -234,7 +236,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { vpx_usec_timer t; vpx_usec_timer_start(&t); - for (decode_video.Begin(); decode_video.cxdata() != NULL; + for (decode_video.Begin(); decode_video.cxdata() != nullptr; decode_video.Next()) { decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); } @@ -247,7 +249,7 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"decode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); printf("\t\"threadCount\" : %u,\n", threads); printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); @@ -256,6 +258,6 @@ TEST_P(VP9NewEncodeDecodePerfTest, PerfTest) { printf("}\n"); } -VP9_INSTANTIATE_TEST_CASE(VP9NewEncodeDecodePerfTest, - ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(VP9NewEncodeDecodePerfTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); } // namespace diff --git a/media/libvpx/libvpx/test/decode_svc_test.cc b/media/libvpx/libvpx/test/decode_svc_test.cc index 69f62f13bd..7098e7b270 100644 --- a/media/libvpx/libvpx/test/decode_svc_test.cc +++ b/media/libvpx/libvpx/test/decode_svc_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "test/codec_factory.h" @@ -24,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - virtual ~DecodeSvcTest() {} + ~DecodeSvcTest() override = default; - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_EQ(img.d_w, width_); ASSERT_EQ(img.d_h, height_); total_frames_ = frame_number; @@ -53,9 +53,9 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, // number of frames decoded. This results in 1/4x1/4 resolution (320x180). TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { const std::string filename = GET_PARAM(1); - testing::internal::scoped_ptr video; + std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 0; @@ -70,9 +70,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) { // number of frames decoded. This results in 1/2x1/2 resolution (640x360). TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { const std::string filename = GET_PARAM(1); - testing::internal::scoped_ptr video; + std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 1; @@ -87,9 +87,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) { // number of frames decoded. This results in the full resolution (1280x720). TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { const std::string filename = GET_PARAM(1); - testing::internal::scoped_ptr video; + std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 2; @@ -105,9 +105,9 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) { // the decoding should result in the full resolution (1280x720). TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { const std::string filename = GET_PARAM(1); - testing::internal::scoped_ptr video; + std::unique_ptr video; video.reset(new libvpx_test::IVFVideoSource(filename)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); total_frames_ = 0; spatial_layer_ = 10; @@ -117,7 +117,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) { ASSERT_EQ(total_frames_, kNumFrames); } -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( DecodeSvcTest, ::testing::ValuesIn(libvpx_test::kVP9TestVectorsSvc, libvpx_test::kVP9TestVectorsSvc + libvpx_test::kNumVP9TestVectorsSvc)); diff --git a/media/libvpx/libvpx/test/decode_test_driver.cc b/media/libvpx/libvpx/test/decode_test_driver.cc index b738e0db1c..af87bc25fb 100644 --- a/media/libvpx/libvpx/test/decode_test_driver.cc +++ b/media/libvpx/libvpx/test/decode_test_driver.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" @@ -26,7 +26,7 @@ vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size, } vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) { - return DecodeFrame(cxdata, size, NULL); + return DecodeFrame(cxdata, size, nullptr); } vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size, @@ -52,21 +52,22 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, /* Vp8's implementation of PeekStream returns an error if the frame you * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first * frame, which must be a keyframe. */ - if (video->frame_number() == 0) - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + if (video->frame_number() == 0) { + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); + } } else { /* The Vp9 implementation of PeekStream returns an error only if the * data passed to it isn't a valid Vp9 chunk. */ - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } } void DecoderTest::RunLoop(CompressedVideoSource *video, const vpx_codec_dec_cfg_t &dec_cfg) { Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_); - ASSERT_TRUE(decoder != NULL); + ASSERT_NE(decoder, nullptr); bool end_of_file = false; // Decode frames. @@ -77,7 +78,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, vpx_codec_stream_info_t stream_info; stream_info.sz = sizeof(stream_info); - if (video->cxdata() != NULL) { + if (video->cxdata() != nullptr) { const vpx_codec_err_t res_peek = decoder->PeekStream( video->cxdata(), video->frame_size(), &stream_info); HandlePeekResult(decoder, video, res_peek); @@ -88,16 +89,16 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, if (!HandleDecodeResult(res_dec, *video, decoder)) break; } else { // Signal end of the file to the decoder. - const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); end_of_file = true; } DxDataIterator dec_iter = decoder->GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data - while ((img = dec_iter.Next())) { + while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) { DecompressedFrameHook(*img, video->frame_number()); } } @@ -110,7 +111,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) { } void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) { - memcpy(&cfg_, &dec_cfg, sizeof(cfg_)); + cfg_ = dec_cfg; } void DecoderTest::set_flags(const vpx_codec_flags_t flags) { flags_ = flags; } diff --git a/media/libvpx/libvpx/test/decode_test_driver.h b/media/libvpx/libvpx/test/decode_test_driver.h index 644fc9e90d..81f5001ebc 100644 --- a/media/libvpx/libvpx/test/decode_test_driver.h +++ b/media/libvpx/libvpx/test/decode_test_driver.h @@ -8,10 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_DECODE_TEST_DRIVER_H_ -#define TEST_DECODE_TEST_DRIVER_H_ +#ifndef VPX_TEST_DECODE_TEST_DRIVER_H_ +#define VPX_TEST_DECODE_TEST_DRIVER_H_ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "vpx/vpx_decoder.h" @@ -24,7 +24,7 @@ class CompressedVideoSource; class DxDataIterator { public: explicit DxDataIterator(vpx_codec_ctx_t *decoder) - : decoder_(decoder), iter_(NULL) {} + : decoder_(decoder), iter_(nullptr) {} const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); } @@ -159,4 +159,4 @@ class DecoderTest { } // namespace libvpx_test -#endif // TEST_DECODE_TEST_DRIVER_H_ +#endif // VPX_TEST_DECODE_TEST_DRIVER_H_ diff --git a/media/libvpx/libvpx/test/decode_to_md5.sh b/media/libvpx/libvpx/test/decode_to_md5.sh index 854b74f84f..15eee39fac 100644 --- a/media/libvpx/libvpx/test/decode_to_md5.sh +++ b/media/libvpx/libvpx/test/decode_to_md5.sh @@ -40,7 +40,7 @@ decode_to_md5() { fi eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 diff --git a/media/libvpx/libvpx/test/decode_with_drops.sh b/media/libvpx/libvpx/test/decode_with_drops.sh index 9b2edb6429..2c826045b3 100644 --- a/media/libvpx/libvpx/test/decode_with_drops.sh +++ b/media/libvpx/libvpx/test/decode_with_drops.sh @@ -40,7 +40,7 @@ decode_with_drops() { fi eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ - "${drop_mode}" ${devnull} + "${drop_mode}" ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } @@ -52,10 +52,10 @@ decode_with_drops() { decode_with_drops_vp8() { if [ "$(vp8_decode_available)" = "yes" ]; then # Test sequence mode: Drop frames 2-28. - decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28" + decode_with_drops "${VP8_IVF_FILE}" "vp8" "2-28" || return 1 # Test pattern mode: Drop 3 of every 4 frames. - decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4" + decode_with_drops "${VP8_IVF_FILE}" "vp8" "3/4" || return 1 fi } @@ -66,10 +66,10 @@ decode_with_drops_vp8() { decode_with_drops_vp9() { if [ "$(vp9_decode_available)" = "yes" ]; then # Test sequence mode: Drop frames 2-28. - decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19" + decode_with_drops "${VP9_IVF_FILE}" "vp9" "2-19" || return 1 # Test pattern mode: Drop 3 of every 4 frames. - decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4" + decode_with_drops "${VP9_IVF_FILE}" "vp9" "3/4" || return 1 fi } diff --git a/media/libvpx/libvpx/test/encode_api_test.cc b/media/libvpx/libvpx/test/encode_api_test.cc index 419e38506c..554b9e7fa9 100644 --- a/media/libvpx/libvpx/test/encode_api_test.cc +++ b/media/libvpx/libvpx/test/encode_api_test.cc @@ -8,25 +8,113 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "test/acm_random.h" +#include "test/video_source.h" +#include "test/y4m_video_source.h" #include "./vpx_config.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" #include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { -#define NELEMENTS(x) static_cast(sizeof(x) / sizeof(x[0])) - -TEST(EncodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { +vpx_codec_iface_t *kCodecIfaces[] = { #if CONFIG_VP8_ENCODER - &vpx_codec_vp8_cx_algo, + &vpx_codec_vp8_cx_algo, #endif #if CONFIG_VP9_ENCODER - &vpx_codec_vp9_cx_algo, + &vpx_codec_vp9_cx_algo, #endif - }; +}; + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +void *Memset16(void *dest, int val, size_t length) { + uint16_t *dest16 = reinterpret_cast(dest); + for (size_t i = 0; i < length; i++) { + *dest16++ = val; + } + return dest; +} + +vpx_image_t *CreateImage(vpx_bit_depth_t bit_depth, vpx_img_fmt_t fmt, + unsigned int width, unsigned int height) { + assert(fmt != VPX_IMG_FMT_NV12); + if (bit_depth > VPX_BITS_8) { + fmt = static_cast(fmt | VPX_IMG_FMT_HIGHBITDEPTH); + } + vpx_image_t *image = vpx_img_alloc(nullptr, fmt, width, height, 1); + if (!image) return image; + + const int val = 1 << (bit_depth - 1); + const unsigned int uv_h = + (image->d_h + image->y_chroma_shift) >> image->y_chroma_shift; + const unsigned int uv_w = + (image->d_w + image->x_chroma_shift) >> image->x_chroma_shift; + if (bit_depth > VPX_BITS_8) { + for (unsigned int i = 0; i < image->d_h; ++i) { + Memset16(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + Memset16(image->planes[1] + i * image->stride[1], val, uv_w); + Memset16(image->planes[2] + i * image->stride[2], val, uv_w); + } + } else { + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], val, image->d_w); + } + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], val, uv_w); + memset(image->planes[2] + i * image->stride[2], val, uv_w); + } + } + + return image; +} + +void InitCodec(vpx_codec_iface_t &iface, int width, int height, + vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { + cfg->g_w = width; + cfg->g_h = height; + cfg->g_lag_in_frames = 0; + cfg->g_pass = VPX_RC_ONE_PASS; + ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK); +} + +// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration +// to |cfg|. +void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) { + libvpx_test::DummyVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.Begin(); + EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(enc); + + EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK) + << vpx_codec_error_detail(enc); +} + +TEST(EncodeAPI, InvalidParams) { uint8_t buf[1] = { 0 }; vpx_image_t img; vpx_codec_ctx_t enc; @@ -34,32 +122,2162 @@ TEST(EncodeAPI, InvalidParams) { EXPECT_EQ(&img, vpx_img_wrap(&img, VPX_IMG_FMT_I420, 1, 1, 1, buf)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(NULL, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_enc_init(&enc, NULL, NULL, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, NULL, 0, 0, 0, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_encode(NULL, &img, 0, 0, 0, 0)); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(NULL, NULL, 0)); + vpx_codec_enc_init(nullptr, nullptr, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(NULL, &cfg, 0)); - EXPECT_TRUE(vpx_codec_error(NULL) != NULL); + vpx_codec_enc_init(&enc, nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, nullptr, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_encode(nullptr, &img, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(nullptr)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, nullptr, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_config_default(nullptr, &cfg, 0)); + EXPECT_NE(vpx_codec_error(nullptr), nullptr); - for (int i = 0; i < NELEMENTS(kCodecs); ++i) { - SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i])); + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(NULL, kCodecs[i], NULL, 0)); + vpx_codec_enc_init(nullptr, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_init(&enc, kCodecs[i], NULL, 0)); + vpx_codec_enc_init(&enc, iface, nullptr, 0)); EXPECT_EQ(VPX_CODEC_INVALID_PARAM, - vpx_codec_enc_config_default(kCodecs[i], &cfg, 1)); + vpx_codec_enc_config_default(iface, &cfg, 1)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, NULL, 0, 0, 0, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0)); EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); } } +TEST(EncodeAPI, HighBitDepthCapability) { +// VP8 should not claim VP9 HBD as a capability. +#if CONFIG_VP8_ENCODER + const vpx_codec_caps_t vp8_caps = vpx_codec_get_caps(&vpx_codec_vp8_cx_algo); + EXPECT_EQ(vp8_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif + +#if CONFIG_VP9_ENCODER + const vpx_codec_caps_t vp9_caps = vpx_codec_get_caps(&vpx_codec_vp9_cx_algo); +#if CONFIG_VP9_HIGHBITDEPTH + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, VPX_CODEC_CAP_HIGHBITDEPTH); +#else + EXPECT_EQ(vp9_caps & VPX_CODEC_CAP_HIGHBITDEPTH, 0); +#endif +#endif +} + +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). +TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +// A test that reproduces https://crbug.com/webm/1831. +TEST(EncodeAPI, RandomPixelsVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 2000; + cfg.g_w = 1280; + cfg.g_h = 720; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Generate random frame data and encode + libvpx_test::RandomVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.SetImageFormat(VPX_IMG_FMT_I420); + video.Begin(); + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} + +TEST(EncodeAPI, ChangeToL1T3AndSetBitrateVp8) { + // Initialize libvpx encoder + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_w = 1; + cfg.g_h = 64; + cfg.g_bit_depth = VPX_BITS_8; + cfg.g_input_bit_depth = 8; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_dropframe_thresh = 0; // Don't drop frames + cfg.rc_resize_allowed = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.rc_target_bitrate = 10; + cfg.rc_min_quantizer = 2; + cfg.rc_max_quantizer = 58; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = 10000; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -6), VPX_CODEC_OK); + + // Generate random frame data and encode + uint8_t img[1 * 64 * 3 / 2]; + libvpx_test::ACMRandom rng; + for (size_t i = 0; i < sizeof(img); ++i) { + img[i] = rng.Rand8(); + } + vpx_image_t img_wrapper; + ASSERT_EQ( + vpx_img_wrap(&img_wrapper, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1, img), + &img_wrapper); + vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, -1, 0, 0, 0), VPX_CODEC_OK); + + cfg.rc_target_bitrate = 4294967; + // Set the scalability mode to L1T3. + cfg.ts_number_layers = 3; + cfg.ts_periodicity = 4; + cfg.ts_layer_id[0] = 0; + cfg.ts_layer_id[1] = 2; + cfg.ts_layer_id[2] = 1; + cfg.ts_layer_id[3] = 2; + cfg.ts_rate_decimator[0] = 4; + cfg.ts_rate_decimator[1] = 2; + cfg.ts_rate_decimator[2] = 1; + // Bitrate allocation L0: 50% L1: 20% L2: 30% + cfg.layer_target_bitrate[0] = cfg.ts_target_bitrate[0] = + 50 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[1] = cfg.ts_target_bitrate[1] = + 70 * cfg.rc_target_bitrate / 100; + cfg.layer_target_bitrate[2] = cfg.ts_target_bitrate[2] = + cfg.rc_target_bitrate; + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT; + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TEMPORAL_LAYER_ID, 2), + VPX_CODEC_OK); + + constexpr vpx_enc_frame_flags_t VP8_UPDATE_NOTHING = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + // Layer 2: only reference last frame, no updates + // It only depends on layer 0 + flags = VP8_UPDATE_NOTHING | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_REF_GF; + ASSERT_EQ( + vpx_codec_encode(&enc, &img_wrapper, 0, 500000, flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Destroy libvpx encoder + vpx_codec_destroy(&enc); +} + +// Emulates the WebCodecs VideoEncoder interface. +class VP8Encoder { + public: + explicit VP8Encoder(int speed) : speed_(speed) {} + ~VP8Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + vpx_enc_deadline_t deadline_ = 0; +}; + +VP8Encoder::~VP8Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP8Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + vpx_enc_deadline_t deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP8Encoder::Encode(bool key_frame) { + assert(initialized_); + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ(vpx_codec_encode(&enc_, image, frame_index_, 1, flags, deadline_), + VPX_CODEC_OK); + ++frame_index_; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (key_frame) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } + } + vpx_img_free(image); +} + +// This is the reproducer testcase for crbug.com/324459561. However, +// just running this test is not enough to reproduce the bug. We also +// need to send signals to the test. +TEST(EncodeAPI, Chromium324459561) { + VP8Encoder encoder(-12); + + encoder.Configure(11, 1685, 652, VPX_CBR, VPX_DL_REALTIME); + + encoder.Encode(true); + encoder.Encode(true); + encoder.Encode(true); + + encoder.Configure(0, 1685, 1, VPX_VBR, VPX_DL_REALTIME); +} + +TEST(EncodeAPI, VP8GlobalHeaders) { + constexpr int kWidth = 320; + constexpr int kHeight = 240; + + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0), + VPX_CODEC_OK); + ASSERT_NO_FATAL_FAILURE( + InitCodec(*vpx_codec_vp8_cx(), kWidth, kHeight, &enc.ctx, &cfg)); + EXPECT_EQ(vpx_codec_get_global_headers(&enc.ctx), nullptr); + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)); + EXPECT_EQ(vpx_codec_get_global_headers(&enc.ctx), nullptr); +} + +// Encode a few frames for 2 temporal layers realtime mode. +// Set duration to be very large on first frame, much smaller +// on second frames, with the timestamp (pts) parameter very +// inconsistent with the duration (i.e, pts != prev_pts + duration). +// This reproduces the issue found in the bug: 431520320. +TEST(EncodeAPI, Vp8ChromiumIssue431520320) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_lag_in_frames = 0; + cfg.rc_target_bitrate = 500; + + // 2-layers, 2-frame period. + int ids[2] = { 0, 1 }; + cfg.ts_periodicity = 2; + cfg.ts_number_layers = 2; + cfg.ts_rate_decimator[0] = 2; + cfg.ts_rate_decimator[1] = 1; + cfg.ts_target_bitrate[0] = 300; + cfg.ts_target_bitrate[1] = 500; + memcpy(cfg.ts_layer_id, ids, sizeof(ids)); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode first frame. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, /*duration=*/800000, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Encode second frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 40000, /*duration=*/40000, 0, + VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Encode third frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 80000, /*duration=*/40000, 0, + VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection2PercentVP8) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.g_lag_in_frames = 0; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 1 percent to cause a signed integer overflow in the + // multiplication cpi->av_per_frame_bandwidth * + // cpi->oxcf.two_pass_vbrmin_section in vp8_new_framerate() if the + // multiplication is done in the `int` type. + cfg.rc_2pass_vbr_minsection_pct = 2; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection101PercentVP8) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.g_lag_in_frames = 0; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 100 percent to cause an error when vbr_min_bits is + // cast to `int` in vp8_new_framerate() if vbr_min_bits is not clamped to + // INT_MAX. + cfg.rc_2pass_vbr_minsection_pct = 101; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, OssFuzz69100) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 64; + cfg.g_h = 64; + cfg.g_lag_in_frames = 25; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 6240592; + cfg.rc_target_bitrate = 1202607620; + cfg.kf_max_dist = 24377; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 1), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_MAXFRAMES, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_STRENGTH, 3), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control_(&enc, VP8E_SET_ARNR_TYPE, 3), + VPX_CODEC_OK); // deprecated + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_NOISE_SENSITIVITY, 0), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TOKEN_PARTITIONS, 0), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 0), + VPX_CODEC_OK); + + libvpx_test::RandomVideoSource video; + video.set_limit(30); + video.SetSize(cfg.g_w, cfg.g_h); + video.SetImageFormat(VPX_IMG_FMT_I420); + video.Begin(); + do { + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK); + video.Next(); + } while (video.img() != nullptr); + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +void EncodeOssFuzz69906(int cpu_used, vpx_enc_deadline_t deadline) { + char str[80]; + snprintf(str, sizeof(str), "cpu_used: %d deadline: %d", cpu_used, + static_cast(deadline)); + SCOPED_TRACE(str); + + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 4097; + cfg.g_h = 16; + cfg.rc_target_bitrate = 1237084865; + cfg.kf_max_dist = 4336; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, cpu_used), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_MAXFRAMES, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_ARNR_STRENGTH, 3), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control_(&enc, VP8E_SET_ARNR_TYPE, 3), + VPX_CODEC_OK); // deprecated + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_NOISE_SENSITIVITY, 0), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_TOKEN_PARTITIONS, 0), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 0), + VPX_CODEC_OK); + + libvpx_test::Y4mVideoSource video("repro-oss-fuzz-69906.y4m", /*start=*/0, + /*limit=*/3); + video.Begin(); + do { + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + /*flags=*/0, deadline), + VPX_CODEC_OK); + video.Next(); + } while (video.img() != nullptr); + + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, OssFuzz69906) { + // Note the original bug report was for speed 1, good quality. The remainder + // of the settings are for added coverage. + for (int cpu_used = 0; cpu_used <= 5; ++cpu_used) { + EncodeOssFuzz69906(cpu_used, VPX_DL_GOOD_QUALITY); + } + + for (int cpu_used = -16; cpu_used <= -5; ++cpu_used) { + EncodeOssFuzz69906(cpu_used, VPX_DL_REALTIME); + } +} +#endif // CONFIG_VP8_ENCODER + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (const auto *iface : kCodecIfaces) { + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + cfg[1] = cfg[0]; + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + EXPECT_EQ(IsVP9(iface) ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + +TEST(EncodeAPI, SetRoi) { + static struct { + vpx_codec_iface_t *iface; + int ctrl_id; + } kCodecs[] = { +#if CONFIG_VP8_ENCODER + { &vpx_codec_vp8_cx_algo, VP8E_SET_ROI_MAP }, +#endif +#if CONFIG_VP9_ENCODER + { &vpx_codec_vp9_cx_algo, VP9E_SET_ROI_MAP }, +#endif + }; + constexpr int kWidth = 64; + constexpr int kHeight = 64; + + for (const auto &codec : kCodecs) { + SCOPED_TRACE(vpx_codec_iface_name(codec.iface)); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + EXPECT_EQ(vpx_codec_enc_config_default(codec.iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + EXPECT_EQ(vpx_codec_enc_init(&enc, codec.iface, &cfg, 0), VPX_CODEC_OK); + + vpx_roi_map_t roi = {}; + uint8_t roi_map[kWidth * kHeight] = {}; + if (IsVP9(codec.iface)) { + roi.rows = (cfg.g_w + 7) >> 3; + roi.cols = (cfg.g_h + 7) >> 3; + } else { + roi.rows = (cfg.g_w + 15) >> 4; + roi.cols = (cfg.g_h + 15) >> 4; + } + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + + roi.roi_map = roi_map; + // VP8 only. This value isn't range checked. + roi.static_threshold[1] = 1000; + roi.static_threshold[2] = UINT_MAX / 2 + 1; + roi.static_threshold[3] = UINT_MAX; + + for (const auto delta : { -63, -1, 0, 1, 63 }) { + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + roi.delta_lf[i] = delta; + // VP9 only. + roi.skip[i] ^= 1; + roi.ref_frame[i] = (roi.ref_frame[i] + 1) % 4; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK); + } + } + + vpx_codec_err_t expected_error; + for (const auto delta : { -64, 64, INT_MIN, INT_MAX }) { + expected_error = VPX_CODEC_INVALID_PARAM; + for (int i = 0; i < 8; ++i) { + roi.delta_q[i] = delta; + // The max segment count for VP8 is 4, the remainder of the entries are + // ignored. + if (i >= 4 && !IsVP9(codec.iface)) expected_error = VPX_CODEC_OK; + + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_q[" << i << "]: " << delta; + roi.delta_q[i] = 0; + + roi.delta_lf[i] = delta; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "delta_lf[" << i << "]: " << delta; + roi.delta_lf[i] = 0; + } + } + + // VP8 should ignore skip[] and ref_frame[] values. + expected_error = + IsVP9(codec.iface) ? VPX_CODEC_INVALID_PARAM : VPX_CODEC_OK; + for (const auto skip : { -2, 2, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.skip[i] = skip; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "skip[" << i << "]: " << skip; + roi.skip[i] = 0; + } + } + + // VP9 allows negative values to be used to disable segmentation. + for (int ref_frame = -3; ref_frame < 0; ++ref_frame) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), VPX_CODEC_OK) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + for (const auto ref_frame : { 4, INT_MIN, INT_MAX }) { + for (int i = 0; i < 8; ++i) { + roi.ref_frame[i] = ref_frame; + EXPECT_EQ(vpx_codec_control_(&enc, codec.ctrl_id, &roi), expected_error) + << "ref_frame[" << i << "]: " << ref_frame; + roi.ref_frame[i] = 0; + } + } + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, ConfigChangeThreadCount) { + constexpr int kWidth = 1920; + constexpr int kHeight = 1080; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + +TEST(EncodeAPI, ConfigResizeChangeThreadCount) { + constexpr int kInitWidth = 1024; + constexpr int kInitHeight = 1024; + + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) { + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Start in threaded mode to ensure resolution and thread related + // allocations are updated correctly across changes in resolution and + // thread counts. See https://crbug.com/1486441. + cfg.g_threads = 4; + EXPECT_NO_FATAL_FAILURE( + InitCodec(*iface, kInitWidth, kInitHeight, &enc.ctx, &cfg)); + if (IsVP9(iface)) { + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6), + VPX_CODEC_OK); + EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i), + VPX_CODEC_OK); + } + + cfg.g_w = 1000; + cfg.g_h = 608; + EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc.ctx); + + cfg.g_w = 1000; + cfg.g_h = 720; + + for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { + cfg.g_threads = threads; + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx)) + << "iteration: " << i << " threads: " << threads; + } + } + } +} + +TEST(EncodeAPI, ConfigResizeBiggerAfterInit) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, ConfigResizeBiggerAfterEncode) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + vpx_codec_ctx_t enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, 1, 1, &enc, &cfg)); + EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc)); + + cfg.g_w = 1920; + cfg.g_h = 1; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + cfg.g_w = 1920; + cfg.g_h = 1080; + EXPECT_EQ(vpx_codec_enc_config_set(&enc, &cfg), + IsVP9(iface) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM); + + EXPECT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsSmallerThanInitialPts) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 12, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_encode(&enc, image, 13, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); + // pts (10) is smaller than the initial pts (12). + ASSERT_EQ(vpx_codec_encode(&enc, image, 10, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PtsOrDurationTooBig) { + for (const auto *iface : kCodecIfaces) { + // Initialize libvpx encoder. + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_OK); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, 2), + VPX_CODEC_INVALID_PARAM); +#endif + // pts, when converted to ticks, is too big. + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#if ULONG_MAX > INT64_MAX + // duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, (1ul << 63), 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + // pts + duration is too big. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 1, INT64_MAX, 0, VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + // pts + duration, when converted to ticks, is too big. +#if ULONG_MAX > INT64_MAX + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 0xbd6b566b15c7, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); +#endif + ASSERT_EQ(vpx_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0, + VPX_DL_BEST_QUALITY), + VPX_CODEC_INVALID_PARAM); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +TEST(EncodeAPI, PerFramePsnr) { + for (const auto *iface : kCodecIfaces) { + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_lag_in_frames = 0; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + vpx_enc_frame_flags_t psnr_flags = VPX_EFLAG_CALCULATE_PSNR; + ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/0, /*duration=*/1, + psnr_flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + + const vpx_codec_cx_pkt_t *pkt; + vpx_codec_iter_t iter = nullptr; + bool had_psnr = false; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) { + ASSERT_EQ(pkt->kind, VPX_CODEC_PSNR_PKT); + had_psnr = true; + } + } + EXPECT_TRUE(had_psnr); + + vpx_enc_frame_flags_t no_psnr_flags = 0; + ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/1, /*duration=*/1, + no_psnr_flags, VPX_DL_REALTIME), + VPX_CODEC_OK); + + iter = nullptr; + had_psnr = false; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) { + ASSERT_EQ(pkt->kind, VPX_CODEC_PSNR_PKT); + had_psnr = true; + } + } +#if CONFIG_INTERNAL_STATS + // CONFIG_INTERNAL_STATS unconditionally generates PSNR. + EXPECT_TRUE(had_psnr); +#else + EXPECT_FALSE(had_psnr); +#endif // CONFIG_INTERNAL_STATS + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); + } +} + +#if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { +#ifdef CHROMIUM + GTEST_SKIP() << "Under Chromium's configuration the allocator is unable" + "to provide the space required for the frames below."; +#else + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; + constexpr auto *iface = &vpx_codec_vp9_cx_algo; + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // The following setting will cause avg_frame_bandwidth in rate control to be + // larger than INT_MAX + cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx + cfg.g_timebase.den = 1; + cfg.g_timebase.num = 10; + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) + << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " + << static_cast(cfg.g_timebase.den) / cfg.g_timebase.num; +#endif // defined(CHROMIUM) +} +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 + +// Emulates the WebCodecs VideoEncoder interface. +class VP9Encoder { + public: + explicit VP9Encoder(int speed) + : speed_(speed), row_mt_(0), bit_depth_(VPX_BITS_8), + fmt_(VPX_IMG_FMT_I420) {} + // The image format `fmt` must not have the VPX_IMG_FMT_HIGHBITDEPTH bit set. + // If bit_depth > 8, we will set the VPX_IMG_FMT_HIGHBITDEPTH bit before + // passing the image format to vpx_img_alloc(). + VP9Encoder(int speed, unsigned int row_mt, vpx_bit_depth_t bit_depth, + vpx_img_fmt_t fmt) + : speed_(speed), row_mt_(row_mt), bit_depth_(bit_depth), fmt_(fmt) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, vpx_enc_deadline_t deadline); + void Encode(bool key_frame); + + private: + const int speed_; + const unsigned int row_mt_; + const vpx_bit_depth_t bit_depth_; + const vpx_img_fmt_t fmt_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + vpx_enc_deadline_t deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + vpx_enc_deadline_t deadline) { + deadline_ = deadline; + + if (!initialized_) { + ASSERT_EQ(fmt_ & VPX_IMG_FMT_HIGHBITDEPTH, 0); + const bool high_bit_depth = bit_depth_ > VPX_BITS_8; + const bool is_420 = fmt_ == VPX_IMG_FMT_I420; + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + // In profiles 0 and 2, only 4:2:0 format is allowed. In profiles 1 and 3, + // all other subsampling formats are allowed. In profiles 0 and 1, only bit + // depth 8 is allowed. In profiles 2 and 3, only bit depths 10 and 12 are + // allowed. + cfg_.g_profile = 2 * high_bit_depth + !is_420; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_bit_depth = bit_depth_; + cfg_.g_input_bit_depth = bit_depth_; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ( + vpx_codec_enc_init(&enc_, iface, &cfg_, + high_bit_depth ? VPX_CODEC_USE_HIGHBITDEPTH : 0), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP9E_SET_ROW_MT, row_mt_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { + assert(initialized_); + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(bit_depth_, fmt_, cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + ++frame_index_; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); + + // First step: encode, without forcing KF. + encoder.Encode(false); + // Second step: change config + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); + // Third step: encode, without forcing KF + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310477034. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, MultipleChangeConfigResize) { + VP9Encoder encoder(3); + + // Set initial config. + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config again. + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 3rd frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); + + // Encode 4th frame with same config, delta frame. + encoder.Encode(false); + + // Encode 5th frame with same config, key frame. + encoder.Encode(true); + + // Change config: change deadline to BEST. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); + + // Encode 6th frame with new config, set delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 7th frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 8th frame with new config, set key frame. + encoder.Encode(true); + + // Encode 9th frame with new config, set delta frame. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); +} + +// This is a test case from clusterfuzz: based on b/312517065. +TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(true); +} + +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311294795 +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311294795) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME); + // Encode more frames with new config. + encoder.Encode(false); + encoder.Encode(false); +} + +// Test case to capture assert issue triggered in +// vp9_bitstream.c for good_quality, speed 1, lossless; +// See comment#22 in issue:433941753. +TEST(EncodeAPI, AssertIssueGoodQualitySpeed1Lossless) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 1540; + cfg.g_h = 838; + cfg.g_profile = 0; + cfg.g_bit_depth = VPX_BITS_8; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 10000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.g_threads = 1; + cfg.rc_target_bitrate = 10000; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_LOSSLESS, 1), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 1), VPX_CODEC_OK); + libvpx_test::RandomVideoSource video; + video.SetSize(cfg.g_w, cfg.g_h); + video.SetImageFormat(VPX_IMG_FMT_I420); + video.set_limit(20); + video.Begin(); + do { + ASSERT_EQ(vpx_codec_encode(&enc, video.img(), video.pts(), video.duration(), + 0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK); + video.Next(); + } while (video.img() != nullptr); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, Buganizer317105128) { + VP9Encoder encoder(-9); + encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(16, 1920, 1, VPX_CBR, VPX_DL_REALTIME); +} + +TEST(EncodeAPI, Buganizer319964497) { + VP9Encoder encoder(7); + encoder.Configure(/*threads=*/1, /*width=*/320, /*height=*/240, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/1, /*width=*/2, /*height=*/2, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + +TEST(EncodeAPI, Buganizer329088759RowMT0) { + VP9Encoder encoder(8, 0, VPX_BITS_8, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/8, /*width=*/1686, /*height=*/398, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/0, /*width=*/1686, /*height=*/1, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/0, /*width=*/1482, /*height=*/113, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/0, /*width=*/881, /*height=*/59, VPX_CBR, + VPX_DL_REALTIME); + encoder.Configure(/*threads=*/13, /*width=*/1271, /*height=*/385, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/2, /*width=*/1, /*height=*/62, VPX_VBR, + VPX_DL_REALTIME); +} + +TEST(EncodeAPI, Buganizer329088759RowMT1) { + VP9Encoder encoder(8, 1, VPX_BITS_8, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/8, /*width=*/1686, /*height=*/398, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Encode(/*key_frame=*/false); + // Needs to set threads to non-zero to repro the issue. + encoder.Configure(/*threads=*/2, /*width=*/1686, /*height=*/1, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/2, /*width=*/1482, /*height=*/113, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/2, /*width=*/881, /*height=*/59, VPX_CBR, + VPX_DL_REALTIME); + encoder.Configure(/*threads=*/13, /*width=*/1271, /*height=*/385, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/2, /*width=*/1, /*height=*/62, VPX_VBR, + VPX_DL_REALTIME); +} + +TEST(EncodeAPI, Buganizer331086799) { + VP9Encoder encoder(6, 1, VPX_BITS_8, VPX_IMG_FMT_I420); + encoder.Configure(0, 1385, 1, VPX_CBR, VPX_DL_REALTIME); + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(false); + encoder.Configure(16, 1385, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Encode(false); + encoder.Configure(0, 1, 1, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); +} + +TEST(EncodeAPI, Buganizer331108729) { + VP9Encoder encoder(1, 1, VPX_BITS_8, VPX_IMG_FMT_I422); + encoder.Configure(0, 1919, 260, VPX_VBR, VPX_DL_REALTIME); + encoder.Configure(9, 440, 1, VPX_CBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(true); + encoder.Configure(8, 1919, 260, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer331108922BitDepth8) { + VP9Encoder encoder(9, 1, VPX_BITS_8, VPX_IMG_FMT_I420); + encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1080, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/0, /*width=*/1, /*height=*/1080, VPX_CBR, + VPX_DL_GOOD_QUALITY); + encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/394, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/798, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + +// Encode some frames, flip from BEST_QUALITY to REALTIME after 2 frames. +// This test is taken from the code snippet in issue:441668134. +TEST(EncodeAPI, Buganizer441668134) { + // Get VP9 encoder interface. + vpx_codec_iface_t *iface = vpx_codec_vp9_cx(); + // Initialize encoder configuration with default values. + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_lag_in_frames = 0; + cfg.rc_max_quantizer = 0; + unsigned long init_flags = 0; + vpx_codec_ctx_t ctx; + ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, init_flags), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, 9), 0); + ASSERT_EQ(vpx_codec_control_(&ctx, VP9E_SET_DELTA_Q_UV, -15), 0); + // Image allocation. + vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420; + vpx_image_t *img = vpx_img_alloc(NULL, img_fmt, cfg.g_w, cfg.g_h, 32); + for (unsigned int y = 0; y < img->d_h; y++) { + for (unsigned int x = 0; x < img->d_w; x++) { + img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF; + } + } + const unsigned int uv_height = (img->d_h + 1) >> 1; + for (int i : { VPX_PLANE_U, VPX_PLANE_V }) { + memset(img->planes[i], 0, img->stride[i] * uv_height); + } + // Encode some frames. + int num_frames = 6; + static constexpr int kChoices[6] = { 1, 1, 0, 0, 0, 0 }; + for (int frame = 0; frame < num_frames; frame++) { + vpx_enc_deadline_t deadline = VPX_DL_REALTIME; + uint8_t dl_choice = kChoices[frame]; + if (dl_choice == 1) deadline = VPX_DL_BEST_QUALITY; + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&ctx, img, frame, 1, 0, deadline), VPX_CODEC_OK); + } + vpx_img_free(img); + vpx_codec_destroy(&ctx); +} + +// Encode a few frames, with realtime mode and tile_rows set to 1, +// with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in +// function write_modes()), as in the issue:442105459. In this test it happens +// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling +// TILE_ROWS, with number of tile_rows more than the number of superblocks. +// This test sets 2 tile_rows with height corresponding to 1 superblock (sb). +TEST(EncodeAPI, Buganizer442105459_2RowTiles) { + // Initialize VP9 encoder interface + vpx_codec_iface_t *iface = vpx_codec_vp9_cx(); + // Get default encoder configuration + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Configure encoder + cfg.g_w = 946u; + cfg.g_h = 64u; // 1 sb row, 2 tile_rows set below. + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_bit_depth = VPX_BITS_8; + // Rate control targeting deeper encoding paths + cfg.rc_target_bitrate = 100; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.ss_number_layers = 1; + cfg.g_lag_in_frames = 0; + // Initialize encoder context + vpx_codec_ctx_t ctx; + ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, 0), VPX_CODEC_OK); + // Set control parameters + vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5); + vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 1); + vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1); + vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1); + // Image format selection + vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420; + // Allocate image with varied alignment + vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1); + for (unsigned int y = 0; y < img->d_h; y++) { + for (unsigned int x = 0; x < img->d_w; x++) { + img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF; + } + } + const unsigned int uv_height = (img->d_h + 1) >> 1; + for (int i : { VPX_PLANE_U, VPX_PLANE_V }) { + memset(img->planes[i], 0, img->stride[i] * uv_height); + } + // Encode with dynamic configuration changes + int num_frames = 2; + // Per-frame constants captured from the original run (indices consumed per + // frame) + const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL }; + const unsigned long frame_durations[] = { 33333UL, 33333UL }; + const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME, + VPX_DL_REALTIME }; + for (int frame = 0; frame < num_frames; frame++) { + // Encode frame + vpx_codec_pts_t pts = frame * frame_pts_mul[frame]; + unsigned long duration = frame_durations[frame]; + vpx_enc_deadline_t deadline = frame_deadlines[frame]; + ASSERT_EQ(vpx_codec_encode(&ctx, img, pts, duration, /*flags*/ 0, deadline), + VPX_CODEC_OK); + } + // Flush encoder. + ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0); + // Get remaining data + vpx_codec_iter_t iter = nullptr; + while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) { + // Process remaining packets + } + vpx_img_free(img); + vpx_codec_destroy(&ctx); +} + +// Encode a few frames, with realtime mode and tile_rows set to 1, +// with row-mt enabled. This triggers an assertion in vp9_bitstream.c (in +// function write_modes()), as in the issue:442105459. In this test it happens +// on very first encoded frame since lag_in_frames = 0. Issue is due to enabling +// TILE_ROWS, with number of tile_rows more than the number of superblocks. +// This test sets 4 tile_rows with height corresponding to 3 superblocks. +TEST(EncodeAPI, Buganizer442105459_4RowTiles) { + // Initialize VP9 encoder interface + vpx_codec_iface_t *iface = vpx_codec_vp9_cx(); + // Get default encoder configuration + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // Configure encoder + cfg.g_w = 946u; + cfg.g_h = 192u; // 3 sb rows, 4 tile_rows set below. + cfg.g_threads = 1; + cfg.g_profile = 0; + cfg.g_bit_depth = VPX_BITS_8; + // Rate control targeting deeper encoding paths + cfg.rc_target_bitrate = 100; + cfg.rc_min_quantizer = 0; + cfg.rc_max_quantizer = 0; + cfg.rc_end_usage = VPX_VBR; + cfg.ss_number_layers = 1; + cfg.g_lag_in_frames = 0; + // Initialize encoder context + vpx_codec_ctx_t ctx; + ASSERT_EQ(vpx_codec_enc_init(&ctx, iface, &cfg, 0), VPX_CODEC_OK); + // Set control parameters + vpx_codec_control_(&ctx, VP8E_SET_CPUUSED, -5); + vpx_codec_control_(&ctx, VP9E_SET_TILE_ROWS, 2); + vpx_codec_control_(&ctx, VP9E_SET_TILE_COLUMNS, 1); + vpx_codec_control_(&ctx, VP9E_SET_ROW_MT, 1); + // Image format selection + vpx_img_fmt_t img_fmt = VPX_IMG_FMT_I420; + // Allocate image with varied alignment + vpx_image_t *img = vpx_img_alloc(nullptr, img_fmt, cfg.g_w, cfg.g_h, 1); + for (unsigned int y = 0; y < img->d_h; y++) { + for (unsigned int x = 0; x < img->d_w; x++) { + img->planes[0][y * img->stride[0] + x] = ((x ^ y) * 127) & 0xFF; + } + } + const unsigned int uv_height = (img->d_h + 1) >> 1; + for (int i : { VPX_PLANE_U, VPX_PLANE_V }) { + memset(img->planes[i], 0, img->stride[i] * uv_height); + } + // Encode with dynamic configuration changes + int num_frames = 2; + // Per-frame constants captured from the original run (indices consumed per + // frame) + const vpx_codec_pts_t frame_pts_mul[] = { 33333UL, 33333UL }; + const unsigned long frame_durations[] = { 33333UL, 33333UL }; + const vpx_enc_deadline_t frame_deadlines[] = { VPX_DL_REALTIME, + VPX_DL_REALTIME }; + for (int frame = 0; frame < num_frames; frame++) { + // Encode frame + vpx_codec_pts_t pts = frame * frame_pts_mul[frame]; + unsigned long duration = frame_durations[frame]; + vpx_enc_deadline_t deadline = frame_deadlines[frame]; + ASSERT_EQ(vpx_codec_encode(&ctx, img, pts, duration, /*flags*/ 0, deadline), + VPX_CODEC_OK); + } + // Flush encoder. + ASSERT_EQ(vpx_codec_encode(&ctx, nullptr, 0, 0, 0, VPX_DL_REALTIME), 0); + // Get remaining data + vpx_codec_iter_t iter = nullptr; + while (vpx_codec_get_cx_data(&ctx, &iter) != nullptr) { + // Process remaining packets + } + vpx_img_free(img); + vpx_codec_destroy(&ctx); +} + +#if CONFIG_VP9_HIGHBITDEPTH +TEST(EncodeAPI, Buganizer329674887RowMT0BitDepth12) { + VP9Encoder encoder(8, 0, VPX_BITS_12, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/2, /*width=*/1030, /*height=*/583, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/0, /*width=*/1030, /*height=*/1, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/0, /*width=*/548, /*height=*/322, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/16, /*width=*/24, /*height=*/583, VPX_CBR, + VPX_DL_GOOD_QUALITY); +} + +TEST(EncodeAPI, Buganizer329179808RowMT0BitDepth10) { + VP9Encoder encoder(4, 0, VPX_BITS_10, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/16, /*width=*/1488, /*height=*/5, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/16, /*width=*/839, /*height=*/1, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/11, /*width=*/657, /*height=*/5, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + +TEST(EncodeAPI, Buganizer329179808RowMT1BitDepth10) { + VP9Encoder encoder(4, 1, VPX_BITS_10, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/16, /*width=*/1488, /*height=*/5, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/16, /*width=*/839, /*height=*/1, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/11, /*width=*/657, /*height=*/5, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} + +TEST(EncodeAPI, Buganizer331108922BitDepth12) { + VP9Encoder encoder(9, 1, VPX_BITS_12, VPX_IMG_FMT_I444); + encoder.Configure(/*threads=*/1, /*width=*/1, /*height=*/1080, VPX_VBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Configure(/*threads=*/0, /*width=*/1, /*height=*/1080, VPX_CBR, + VPX_DL_GOOD_QUALITY); + encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/394, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); + encoder.Encode(/*key_frame=*/true); + encoder.Configure(/*threads=*/16, /*width=*/1, /*height=*/798, VPX_CBR, + VPX_DL_REALTIME); + encoder.Encode(/*key_frame=*/false); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +TEST(EncodeAPI, VP9GlobalHeaders) { + constexpr int kWidth = 320; + constexpr int kHeight = 240; + + libvpx_test::DummyVideoSource video; + video.SetSize(kWidth, kHeight); + +#if CONFIG_VP9_HIGHBITDEPTH + const int profiles[] = { 0, 1, 2, 3 }; +#else + const int profiles[] = { 0, 1 }; +#endif + char str[80]; + for (const int profile : profiles) { + std::vector bitdepths; + std::vector formats; + switch (profile) { + case 0: + bitdepths = { VPX_BITS_8 }; + formats = { VPX_IMG_FMT_I420 }; + break; + case 1: + bitdepths = { VPX_BITS_8 }; + formats = { VPX_IMG_FMT_I422, VPX_IMG_FMT_I444 }; + break; +#if CONFIG_VP9_HIGHBITDEPTH + case 2: + bitdepths = { VPX_BITS_10, VPX_BITS_12 }; + formats = { VPX_IMG_FMT_I42016 }; + break; + case 3: + bitdepths = { VPX_BITS_10, VPX_BITS_12 }; + formats = { VPX_IMG_FMT_I42216, VPX_IMG_FMT_I44416 }; + break; +#endif + } + + for (const auto format : formats) { + for (const auto bitdepth : bitdepths) { + snprintf(str, sizeof(str), "profile: %d bitdepth: %d format: %d", + profile, bitdepth, format); + SCOPED_TRACE(str); + + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + vpx_codec_ctx_t *const ctx = &enc.ctx; + + ASSERT_EQ(vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0), + VPX_CODEC_OK); + cfg.g_w = kWidth; + cfg.g_h = kHeight; + cfg.g_lag_in_frames = 0; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_profile = profile; + cfg.g_bit_depth = bitdepth; + ASSERT_EQ( + vpx_codec_enc_init(ctx, vpx_codec_vp9_cx(), &cfg, + bitdepth == 8 ? 0 : VPX_CODEC_USE_HIGHBITDEPTH), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control_(ctx, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control_(ctx, VP9E_SET_TARGET_LEVEL, 62), + VPX_CODEC_OK); + + vpx_fixed_buf_t *global_headers = vpx_codec_get_global_headers(ctx); + EXPECT_NE(global_headers, nullptr); + EXPECT_EQ(global_headers->sz, size_t{ 9 }); + + video.SetImageFormat(format); + video.Begin(); + EXPECT_EQ( + vpx_codec_encode(ctx, video.img(), video.pts(), video.duration(), + /*flags=*/0, VPX_DL_GOOD_QUALITY), + VPX_CODEC_OK) + << vpx_codec_error_detail(ctx); + + global_headers = vpx_codec_get_global_headers(ctx); + EXPECT_NE(global_headers, nullptr); + EXPECT_EQ(global_headers->sz, size_t{ 12 }); + uint8_t chroma_subsampling; + if ((format & VPX_IMG_FMT_I420) == VPX_IMG_FMT_I420) { + chroma_subsampling = 1; + } else if ((format & VPX_IMG_FMT_I422) == VPX_IMG_FMT_I422) { + chroma_subsampling = 2; + } else { // VPX_IMG_FMT_I444 + chroma_subsampling = 3; + } + const uint8_t expected_headers[] = { 1, + 1, + static_cast(profile), + 2, + 1, + /*level,*/ 3, + 1, + static_cast(bitdepth), + 4, + 1, + chroma_subsampling }; + const uint8_t *actual_headers = + reinterpret_cast(global_headers->buf); + for (int i = 0; i < 5; ++i) { + EXPECT_EQ(expected_headers[i], actual_headers[i]) << "index: " << i; + } + EXPECT_NE(actual_headers[6], 0); // level + for (int i = 5; i < 11; ++i) { + EXPECT_EQ(expected_headers[i], actual_headers[i + 1]) + << "index: " << i + 1; + } + } + } + } +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection2PercentVP9) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.g_lag_in_frames = 0; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 1 percent to cause a signed integer overflow in the + // multiplication rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section in + // vp9_rc_update_framerate() if the multiplication is done in the `int` type. + cfg.rc_2pass_vbr_minsection_pct = 2; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, AomediaIssue3509VbrMinSection101PercentVP9) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 1920; + cfg.g_h = 1080; + cfg.g_lag_in_frames = 0; + cfg.rc_target_bitrate = 1000000; + // Set this to more than 100 percent to cause an error when vbr_min_bits is + // cast to `int` in vp9_rc_update_framerate() if vbr_min_bits is not clamped + // to INT_MAX. + cfg.rc_2pass_vbr_minsection_pct = 101; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + // `duration` can go as high as 300, but the UBSan error is gone if + // `duration` is 301 or higher. + ASSERT_EQ( + vpx_codec_encode(&enc, image, 0, /*duration=*/300, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, Chromium352414650) { + // Initialize libvpx encoder. + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + + cfg.g_w = 1024; + cfg.g_h = 1024; + cfg.g_profile = 0; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_max_quantizer = 58; + cfg.rc_min_quantizer = 2; + cfg.g_threads = 4; + cfg.rc_resize_allowed = 0; + cfg.rc_dropframe_thresh = 0; + cfg.g_timebase.num = 1; + cfg.g_timebase.den = 1000000; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = 10000; + cfg.rc_end_usage = VPX_CBR; + cfg.rc_target_bitrate = 754974; + cfg.ts_number_layers = 3; + cfg.ts_periodicity = 4; + cfg.ts_layer_id[0] = 0; + cfg.ts_layer_id[1] = 2; + cfg.ts_layer_id[2] = 1; + cfg.ts_layer_id[3] = 2; + cfg.ts_rate_decimator[0] = 4; + cfg.ts_rate_decimator[1] = 2; + cfg.ts_rate_decimator[2] = 1; + cfg.layer_target_bitrate[0] = 2147483; + cfg.layer_target_bitrate[1] = 3006476; + cfg.layer_target_bitrate[2] = 4294967; + cfg.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT; + + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, 7), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_TILE_COLUMNS, 2), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_ROW_MT, 1), VPX_CODEC_OK); + + vpx_svc_extra_cfg_t svc_cfg = {}; + svc_cfg.max_quantizers[0] = svc_cfg.max_quantizers[1] = + svc_cfg.max_quantizers[2] = 58; + svc_cfg.min_quantizers[0] = svc_cfg.min_quantizers[1] = + svc_cfg.min_quantizers[2] = 2; + svc_cfg.scaling_factor_num[0] = svc_cfg.scaling_factor_num[1] = + svc_cfg.scaling_factor_num[2] = 1; + svc_cfg.scaling_factor_den[0] = svc_cfg.scaling_factor_den[1] = + svc_cfg.scaling_factor_den[2] = 1; + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_SVC_PARAMETERS, &svc_cfg), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_SVC, 1), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_AQ_MODE, 3), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_STATIC_THRESHOLD, 1), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_COLOR_SPACE, VPX_CS_SMPTE_170), + VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc, VP9E_SET_COLOR_RANGE, VPX_CR_STUDIO_RANGE), + VPX_CODEC_OK); + + // Create input image. + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, /*duration=*/500000, + VPX_EFLAG_FORCE_KF, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(EncodeAPI, PerFramePsnrNotSupportedWithLagInFrames) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + ASSERT_NE(cfg.g_lag_in_frames, 0u); + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + vpx_image_t *const image = + CreateImage(VPX_BITS_8, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + vpx_enc_frame_flags_t psnr_flags = VPX_EFLAG_CALCULATE_PSNR; + ASSERT_EQ(vpx_codec_encode(&enc, image, /*pts=*/0, /*duration=*/1, psnr_flags, + VPX_DL_REALTIME), + VPX_CODEC_INCAPABLE); + + // Free resources. + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/media/libvpx/libvpx/test/encode_perf_test.cc b/media/libvpx/libvpx/test/encode_perf_test.cc index 0bb435502b..75d6838f60 100644 --- a/media/libvpx/libvpx/test/encode_perf_test.cc +++ b/media/libvpx/libvpx/test/encode_perf_test.cc @@ -7,15 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "./vpx_version.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" #include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" #include "vpx_ports/vpx_timer.h" namespace { @@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), }; -const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 }; const int kEncodePerfTestThreads[] = { 1, 2, 4 }; #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) @@ -61,9 +61,9 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - virtual ~VP9EncodePerfTest() {} + ~VP9EncodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -82,8 +82,8 @@ class VP9EncodePerfTest cfg_.g_threads = threads_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { const int log2_tile_columns = 3; encoder->Control(VP8E_SET_CPUUSED, speed_); @@ -93,19 +93,19 @@ class VP9EncodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) { min_psnr_ = pkt->data.psnr.psnr[0]; } } // for performance reasons don't decode - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } double min_psnr() const { return min_psnr_; } @@ -169,7 +169,7 @@ TEST_P(VP9EncodePerfTest, PerfTest) { printf("{\n"); printf("\t\"type\" : \"encode_perf_test\",\n"); - printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"version\" : \"%s\",\n", vpx_codec_version_str()); printf("\t\"videoName\" : \"%s\",\n", display_name.c_str()); printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); printf("\t\"totalFrames\" : %u,\n", frames); @@ -183,6 +183,6 @@ TEST_P(VP9EncodePerfTest, PerfTest) { } } -VP9_INSTANTIATE_TEST_CASE(VP9EncodePerfTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(VP9EncodePerfTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/media/libvpx/libvpx/test/encode_test_driver.cc b/media/libvpx/libvpx/test/encode_test_driver.cc index 632c98f05a..770f410d80 100644 --- a/media/libvpx/libvpx/test/encode_test_driver.cc +++ b/media/libvpx/libvpx/test/encode_test_driver.cc @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/codec_factory.h" @@ -51,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) { } } -void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { +void Encoder::EncodeFrame(VideoSource *video, + const vpx_enc_frame_flags_t frame_flags) { if (video->img()) { EncodeFrameInternal(*video, frame_flags); } else { @@ -69,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { } void Encoder::EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags) { + const vpx_enc_frame_flags_t frame_flags) { vpx_codec_err_t res; const vpx_image_t *img = video.img(); @@ -90,7 +92,7 @@ void Encoder::EncodeFrameInternal(const VideoSource &video, void Encoder::Flush() { const vpx_codec_err_t res = - vpx_codec_encode(&encoder_, NULL, 0, 0, 0, deadline_); + vpx_codec_encode(&encoder_, nullptr, 0, 0, 0, deadline_); if (!encoder_.priv) ASSERT_EQ(VPX_CODEC_ERROR, res) << EncoderError(); else @@ -128,6 +130,8 @@ static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) { bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) && (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h); + if (!match) return false; + const unsigned int width_y = img1->d_w; const unsigned int height_y = img1->d_h; unsigned int i; @@ -166,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) { ASSERT_TRUE(passes_ == 1 || passes_ == 2); for (unsigned int pass = 0; pass < passes_; pass++) { - last_pts_ = 0; + vpx_codec_pts_t last_pts = 0; if (passes_ == 1) { cfg_.g_pass = VPX_RC_ONE_PASS; @@ -177,9 +181,9 @@ void EncoderTest::RunLoop(VideoSource *video) { } BeginPassHook(pass); - testing::internal::scoped_ptr encoder( + std::unique_ptr encoder( codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_)); - ASSERT_TRUE(encoder.get() != NULL); + ASSERT_NE(encoder.get(), nullptr); ASSERT_NO_FATAL_FAILURE(video->Begin()); encoder->InitEncoder(video); @@ -191,16 +195,18 @@ void EncoderTest::RunLoop(VideoSource *video) { if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) { dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS; } - testing::internal::scoped_ptr decoder( + std::unique_ptr decoder( codec_->CreateDecoder(dec_cfg, dec_init_flags)); bool again; for (again = true; again; video->Next()) { - again = (video->img() != NULL); + again = (video->img() != nullptr); PreEncodeFrameHook(video); PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); + PostEncodeFrameHook(encoder.get()); + CxDataIterator iter = encoder->GetCxData(); bool has_cxdata = false; @@ -211,7 +217,8 @@ void EncoderTest::RunLoop(VideoSource *video) { switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: has_cxdata = true; - if (decoder.get() != NULL && DoDecode()) { + if (decoder != nullptr && DoDecode()) { + PreDecodeFrameHook(video, decoder.get()); vpx_codec_err_t res_dec = decoder->DecodeFrame( (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); @@ -219,20 +226,22 @@ void EncoderTest::RunLoop(VideoSource *video) { has_dxdata = true; } - ASSERT_GE(pkt->data.frame.pts, last_pts_); - last_pts_ = pkt->data.frame.pts; + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; FramePktHook(pkt); break; case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + default: break; } } // Flush the decoder when there are no more fragments. if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) { - const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + const vpx_codec_err_t res_dec = decoder->DecodeFrame(nullptr, 0); if (!HandleDecodeResult(res_dec, *video, decoder.get())) break; } diff --git a/media/libvpx/libvpx/test/encode_test_driver.h b/media/libvpx/libvpx/test/encode_test_driver.h index 09b1a78344..c9a7c154f2 100644 --- a/media/libvpx/libvpx/test/encode_test_driver.h +++ b/media/libvpx/libvpx/test/encode_test_driver.h @@ -7,19 +7,19 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_ENCODE_TEST_DRIVER_H_ -#define TEST_ENCODE_TEST_DRIVER_H_ +#ifndef VPX_TEST_ENCODE_TEST_DRIVER_H_ +#define VPX_TEST_ENCODE_TEST_DRIVER_H_ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { @@ -33,15 +33,24 @@ enum TestMode { kTwoPassGood, kTwoPassBest }; + +#if CONFIG_REALTIME_ONLY +#define ALL_TEST_MODES ::testing::Values(::libvpx_test::kRealTime) +#define ONE_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime) +#define ONE_OR_TWO_PASS_TEST_MODES ::testing::Values(::libvpx_test::kRealTime) +#else #define ALL_TEST_MODES \ ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood, \ ::libvpx_test::kOnePassBest, ::libvpx_test::kTwoPassGood, \ ::libvpx_test::kTwoPassBest) - #define ONE_PASS_TEST_MODES \ ::testing::Values(::libvpx_test::kRealTime, ::libvpx_test::kOnePassGood, \ ::libvpx_test::kOnePassBest) +#define ONE_OR_TWO_PASS_TEST_MODES \ + ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kTwoPassGood) +#endif + #define TWO_PASS_TEST_MODES \ ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kTwoPassBest) @@ -49,7 +58,7 @@ enum TestMode { class CxDataIterator { public: explicit CxDataIterator(vpx_codec_ctx_t *encoder) - : encoder_(encoder), iter_(NULL) {} + : encoder_(encoder), iter_(nullptr) {} const vpx_codec_cx_pkt_t *Next() { return vpx_codec_get_cx_data(encoder_, &iter_); @@ -86,7 +95,7 @@ class TwopassStatsStore { // level of abstraction will be fleshed out as more tests are written. class Encoder { public: - Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline, + Encoder(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, const unsigned long init_flags, TwopassStatsStore *stats) : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) { memset(&encoder_, 0, sizeof(encoder_)); @@ -103,7 +112,7 @@ class Encoder { } // This is a thin wrapper around vpx_codec_encode(), so refer to // vpx_encoder.h for its semantics. - void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags); // Convenience wrapper for EncodeFrame() void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } @@ -128,24 +137,56 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + void Control(int ctrl_id, struct vpx_svc_parameters *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_spatial_layer_sync *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + +#if CONFIG_VP9_ENCODER + void Control(int ctrl_id, vpx_rc_funcs_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, VpxTplGopStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif // CONFIG_VP9_ENCODER + #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER void Control(int ctrl_id, vpx_active_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } -#endif + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); cfg_ = *cfg; } - void set_deadline(unsigned long deadline) { deadline_ = deadline; } + void set_deadline(vpx_enc_deadline_t deadline) { deadline_ = deadline; } protected: virtual vpx_codec_iface_t *CodecInterface() const = 0; @@ -157,14 +198,14 @@ class Encoder { // Encode an image void EncodeFrameInternal(const VideoSource &video, - const unsigned long frame_flags); + vpx_enc_frame_flags_t frame_flags); // Flush the encoder on EOS void Flush(); vpx_codec_ctx_t encoder_; vpx_codec_enc_cfg_t cfg_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; unsigned long init_flags_; TwopassStatsStore *stats_; }; @@ -179,8 +220,7 @@ class Encoder { class EncoderTest { protected: explicit EncoderTest(const CodecFactory *codec) - : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), - last_pts_(0) { + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) { // Default to 1 thread. cfg_.g_threads = 1; } @@ -212,12 +252,20 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PreDecodeFrameHook(VideoSource * /*video*/, + Decoder * /*decoder*/) {} + + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + // Hook to be called on every compressed data packet. virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} // Hook to be called on every PSNR packet. virtual void PSNRPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to be called on every first pass stats packet. + virtual void StatsPktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} + // Hook to determine whether the encode loop should continue. virtual bool Continue() const { return !(::testing::Test::HasFatalFailure() || abort_); @@ -225,7 +273,7 @@ class EncoderTest { const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding - virtual bool DoDecode() const { return 1; } + virtual bool DoDecode() const { return true; } // Hook to handle encode/decode mismatch virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); @@ -252,13 +300,12 @@ class EncoderTest { vpx_codec_enc_cfg_t cfg_; vpx_codec_dec_cfg_t dec_cfg_; unsigned int passes_; - unsigned long deadline_; + vpx_enc_deadline_t deadline_; TwopassStatsStore stats_; unsigned long init_flags_; - unsigned long frame_flags_; - vpx_codec_pts_t last_pts_; + vpx_enc_frame_flags_t frame_flags_; }; } // namespace libvpx_test -#endif // TEST_ENCODE_TEST_DRIVER_H_ +#endif // VPX_TEST_ENCODE_TEST_DRIVER_H_ diff --git a/media/libvpx/libvpx/test/error_resilience_test.cc b/media/libvpx/libvpx/test/error_resilience_test.cc index 030b67c572..9bd43b72bf 100644 --- a/media/libvpx/libvpx/test/error_resilience_test.cc +++ b/media/libvpx/libvpx/test/error_resilience_test.cc @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "vpx_config.h" namespace { @@ -30,7 +31,7 @@ class ErrorResilienceTestLarge Reset(); } - virtual ~ErrorResilienceTestLarge() {} + ~ErrorResilienceTestLarge() override = default; void Reset() { error_nframes_ = 0; @@ -38,19 +39,19 @@ class ErrorResilienceTestLarge pattern_switch_ = 0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = 0.0; nframes_ = 0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } @@ -90,7 +91,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video) override { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. @@ -129,21 +130,21 @@ class ErrorResilienceTestLarge return 0.0; } - virtual bool DoDecode() const { + bool DoDecode() const override { if (error_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < error_nframes_; ++i) { if (error_frames_[i] == nframes_ - 1) { std::cout << " Skipping decoding frame: " << error_frames_[i] << "\n"; - return 0; + return false; } } } - return 1; + return true; } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; @@ -194,6 +195,10 @@ class ErrorResilienceTestLarge }; TEST_P(ErrorResilienceTestLarge, OnVersusOff) { +#if CONFIG_REALTIME_ONLY + GTEST_SKIP() + << "Non-zero g_lag_in_frames is unsupported with CONFIG_REALTIME_ONLY"; +#else const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = 2000; @@ -222,6 +227,7 @@ TEST_P(ErrorResilienceTestLarge, OnVersusOff) { EXPECT_GE(psnr_ratio, 0.9); EXPECT_LE(psnr_ratio, 1.1); } +#endif // CONFIG_REALTIME_ONLY } // Check for successful decoding and no encoder/decoder mismatch @@ -381,7 +387,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - virtual ~ErrorResilienceTestLargeCodecControls() {} + ~ErrorResilienceTestLargeCodecControls() override = default; void Reset() { last_pts_ = 0; @@ -393,7 +399,7 @@ class ErrorResilienceTestLargeCodecControls duration_ = 0.0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } @@ -460,8 +466,8 @@ class ErrorResilienceTestLargeCodecControls return layer_id; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (cfg_.ts_number_layers > 1) { int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers); @@ -476,7 +482,7 @@ class ErrorResilienceTestLargeCodecControls } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; if (duration > 1) { @@ -496,7 +502,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook(void) { + void EndPassHook() override { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); @@ -573,10 +579,10 @@ TEST_P(ErrorResilienceTestLargeCodecControls, CodecControl3TemporalLayers) { } } -VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, - ::testing::Values(true)); -VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTestLargeCodecControls, - ONE_PASS_TEST_MODES); -VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, - ::testing::Values(true)); +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); +VP8_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLargeCodecControls, + ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES, + ::testing::Values(true)); } // namespace diff --git a/media/libvpx/libvpx/test/external_frame_buffer_test.cc b/media/libvpx/libvpx/test/external_frame_buffer_test.cc index f9686695a7..7b9a836fbc 100644 --- a/media/libvpx/libvpx/test/external_frame_buffer_test.cc +++ b/media/libvpx/libvpx/test/external_frame_buffer_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "./vpx_config.h" @@ -34,7 +35,8 @@ struct ExternalFrameBuffer { // Class to manipulate a list of external frame buffers. class ExternalFrameBufferList { public: - ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {} + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(nullptr) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -49,7 +51,7 @@ class ExternalFrameBufferList { num_buffers_ = num_buffers; ext_fb_list_ = new ExternalFrameBuffer[num_buffers_]; - EXPECT_TRUE(ext_fb_list_ != NULL); + EXPECT_NE(ext_fb_list_, nullptr); memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_); return true; } @@ -59,7 +61,7 @@ class ExternalFrameBufferList { // frame buffer is in use by libvpx. Finally sets |fb| to point to the // external frame buffer. Returns < 0 on an error. int GetFreeFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { - EXPECT_TRUE(fb != NULL); + EXPECT_NE(fb, nullptr); const int idx = FindFreeBufferIndex(); if (idx == num_buffers_) return -1; @@ -71,19 +73,21 @@ class ExternalFrameBufferList { } SetFrameBuffer(idx, fb); + + num_used_buffers_++; return 0; } // Test function that will not allocate any data for the frame buffer. // Returns < 0 on an error. int GetZeroFrameBuffer(size_t min_size, vpx_codec_frame_buffer_t *fb) { - EXPECT_TRUE(fb != NULL); + EXPECT_NE(fb, nullptr); const int idx = FindFreeBufferIndex(); if (idx == num_buffers_) return -1; if (ext_fb_list_[idx].size < min_size) { delete[] ext_fb_list_[idx].data; - ext_fb_list_[idx].data = NULL; + ext_fb_list_[idx].data = nullptr; ext_fb_list_[idx].size = min_size; } @@ -94,25 +98,26 @@ class ExternalFrameBufferList { // Marks the external frame buffer that |fb| is pointing to as free. // Returns < 0 on an error. int ReturnFrameBuffer(vpx_codec_frame_buffer_t *fb) { - if (fb == NULL) { - EXPECT_TRUE(fb != NULL); + if (fb == nullptr) { + EXPECT_NE(fb, nullptr); return -1; } ExternalFrameBuffer *const ext_fb = reinterpret_cast(fb->priv); - if (ext_fb == NULL) { - EXPECT_TRUE(ext_fb != NULL); + if (ext_fb == nullptr) { + EXPECT_NE(ext_fb, nullptr); return -1; } EXPECT_EQ(1, ext_fb->in_use); ext_fb->in_use = 0; + num_used_buffers_--; return 0; } - // Checks that the ximage data is contained within the external frame buffer - // private data passed back in the ximage. - void CheckXImageFrameBuffer(const vpx_image_t *img) { - if (img->fb_priv != NULL) { + // Checks that the vpx_image_t data is contained within the external frame + // buffer private data passed back in the vpx_image_t. + void CheckImageFrameBuffer(const vpx_image_t *img) { + if (img->fb_priv != nullptr) { const struct ExternalFrameBuffer *const ext_fb = reinterpret_cast(img->fb_priv); @@ -121,6 +126,8 @@ class ExternalFrameBufferList { } } + int num_used_buffers() const { return num_used_buffers_; } + private: // Returns the index of the first free frame buffer. Returns |num_buffers_| // if there are no free frame buffers. @@ -136,7 +143,7 @@ class ExternalFrameBufferList { // Sets |fb| to an external frame buffer. idx is the index into the frame // buffer list. void SetFrameBuffer(int idx, vpx_codec_frame_buffer_t *fb) { - ASSERT_TRUE(fb != NULL); + ASSERT_NE(fb, nullptr); fb->data = ext_fb_list_[idx].data; fb->size = ext_fb_list_[idx].size; ASSERT_EQ(0, ext_fb_list_[idx].in_use); @@ -145,6 +152,7 @@ class ExternalFrameBufferList { } int num_buffers_; + int num_used_buffers_; ExternalFrameBuffer *ext_fb_list_; }; @@ -200,15 +208,14 @@ class ExternalFrameBufferMD5Test protected: ExternalFrameBufferMD5Test() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), - md5_file_(NULL), num_buffers_(0) {} + md5_file_(nullptr), num_buffers_(0) {} - virtual ~ExternalFrameBufferMD5Test() { - if (md5_file_ != NULL) fclose(md5_file_); + ~ExternalFrameBufferMD5Test() override { + if (md5_file_ != nullptr) fclose(md5_file_); } - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (num_buffers_ > 0 && video.frame_number() == 0) { // Have libvpx use frame buffers we create. ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); @@ -220,13 +227,13 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_NE(md5_file_, nullptr) + << "Md5 file open failed. Filename: " << md5_file_name_; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { - ASSERT_TRUE(md5_file_ != NULL); + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -273,26 +280,30 @@ class ExternalFrameBufferMD5Test #if CONFIG_WEBM_IO const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { protected: - ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {} + ExternalFrameBufferTest() + : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); - ASSERT_TRUE(video_ != NULL); + ASSERT_NE(video_, nullptr); video_->Init(); video_->Begin(); vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + ASSERT_NE(decoder_, nullptr); } - virtual void TearDown() { + void TearDown() override { delete decoder_; + decoder_ = nullptr; delete video_; + video_ = nullptr; } // Passes the external frame buffer information to libvpx. @@ -316,7 +327,7 @@ class ExternalFrameBufferTest : public ::testing::Test { } vpx_codec_err_t DecodeRemainingFrames() { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -325,14 +336,13 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - private: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data - while ((img = dec_iter.Next()) != NULL) { - fb_list_.CheckXImageFrameBuffer(img); + while ((img = dec_iter.Next()) != nullptr) { + fb_list_.CheckImageFrameBuffer(img); } } @@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test { int num_buffers_; ExternalFrameBufferList fb_list_; }; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + void SetUp() override { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_NE(video_, nullptr); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_NE(decoder_, nullptr); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. @@ -364,7 +393,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { #endif // Open compressed video file. - testing::internal::scoped_ptr video; + std::unique_ptr video; if (filename.substr(filename.length() - 3, 3) == "ivf") { video.reset(new libvpx_test::IVFVideoSource(filename)); } else { @@ -376,7 +405,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct md5 file name. @@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); } @@ -451,13 +482,14 @@ TEST_F(ExternalFrameBufferTest, NullGetFunction) { const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; ASSERT_EQ( VPX_CODEC_INVALID_PARAM, - SetFrameBufferFunctions(num_buffers, NULL, release_vp9_frame_buffer)); + SetFrameBufferFunctions(num_buffers, nullptr, release_vp9_frame_buffer)); } TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; - ASSERT_EQ(VPX_CODEC_INVALID_PARAM, - SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, NULL)); + ASSERT_EQ( + VPX_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, nullptr)); } TEST_F(ExternalFrameBufferTest, SetAfterDecode) { @@ -467,9 +499,18 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); } + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} #endif // CONFIG_WEBM_IO -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( ExternalFrameBufferMD5Test, ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/media/libvpx/libvpx/test/fdct4x4_test.cc b/media/libvpx/libvpx/test/fdct4x4_test.cc deleted file mode 100644 index 444b0209d5..0000000000 --- a/media/libvpx/libvpx/test/fdct4x4_test.cc +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vp9/common/vp9_entropy.h" -#include "vpx/vpx_codec.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -using libvpx_test::ACMRandom; - -namespace { -const int kNumCoeffs = 16; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); - -typedef std::tr1::tuple Dct4x4Param; -typedef std::tr1::tuple Ht4x4Param; - -void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vpx_fdct4x4_c(in, out, stride); -} - -void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { - vp9_fht4x4_c(in, out, stride, tx_type); -} - -void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vp9_fwht4x4_c(in, out, stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, out, stride, 12); -} - -void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10); -} - -void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12); -} - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12); -} - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10); -} - -void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12); -} -#endif // HAVE_SSE2 -#endif // CONFIG_VP9_HIGHBITDEPTH - -class Trans4x4TestBase { - public: - virtual ~Trans4x4TestBase() {} - - protected: - virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0; - - virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0; - - void RunAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - uint32_t max_error = 0; - int64_t total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - test_input_block[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - test_input_block[j] = src16[j] - dst16[j]; -#endif - } - } - - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(test_input_block, test_temp_block, pitch_)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - ASSERT_EQ(VPX_BITS_8, bit_depth_); - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - if (max_error < error) max_error = error; - total_error += error; - } - } - - EXPECT_GE(static_cast(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; - - EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; - } - - void RunCoeffCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); - } - - fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) - EXPECT_EQ(output_block[j], output_ref_block[j]); - } - } - - void RunMemCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; - } - if (i == 0) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; - } else if (i == 1) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; - } - - fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(input_extreme_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]); - EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) - << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - } - } - } - - void RunInvAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - in[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - in[j] = src16[j] - dst16[j]; -#endif - } - } - - fwd_txfm_ref(in, coeff, pitch_, tx_type_); - - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - EXPECT_GE(static_cast(limit), error) - << "Error: 4x4 IDCT has error " << error << " at index " << j; - } - } - } - - int pitch_; - int tx_type_; - FhtFunc fwd_txfm_ref; - vpx_bit_depth_t bit_depth_; - int mask_; -}; - -class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4DCT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fdct4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4HT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride, tx_type_); - } - - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride, tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; -}; - -TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam { - public: - virtual ~Trans4x4WHT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fwht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); } - -TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } -using std::tr1::make_tuple; - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_neon, - 0, VPX_BITS_8))); -#if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - NEON, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // !CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); -#endif - -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_sse2, - &vpx_idct4x4_16_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); - -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_msa, - &vpx_idct4x4_16_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - MSA, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -} // namespace diff --git a/media/libvpx/libvpx/test/fdct8x8_test.cc b/media/libvpx/libvpx/test/fdct8x8_test.cc index e403404906..4c2cbf43fd 100644 --- a/media/libvpx/libvpx/test/fdct8x8_test.cc +++ b/media/libvpx/libvpx/test/fdct8x8_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -22,6 +23,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vpx_config.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -36,16 +38,16 @@ const double kPi = 3.141592653589793238462643383279502884; const int kSignBiasMaxDiff255 = 1500; const int kSignBiasMaxDiff15 = 10000; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +using FdctFunc = void (*)(const int16_t *in, tran_low_t *out, int stride); +using IdctFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride); +using FhtFunc = void (*)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple Dct8x8Param; -typedef std::tr1::tuple Ht8x8Param; -typedef std::tr1::tuple Idct8x8Param; +using Dct8x8Param = std::tuple; +using Ht8x8Param = std::tuple; +using Idct8x8Param = std::tuple; void reference_8x8_dct_1d(const double in[8], double out[8]) { const double kInvSqrt2 = 0.707106781186547524400844362104; @@ -88,52 +90,64 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { #if CONFIG_VP9_HIGHBITDEPTH void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); } void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12); + vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); } #if HAVE_SSE2 void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_c(in, out, stride, 12); + vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); } void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12); + vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); } #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) < 17.12.3 targeting AArch64 with optimizations +// enabled produces invalid code in RunExtremalCheck() and +// RunInvAccuracyCheck(). See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +#if defined(_MSC_FULL_VER) && _MSC_FULL_VER < 194234435 && \ + defined(_M_ARM64) && !defined(__clang__) +#define AOM_WORK_AROUND_MSVC_BUG_10369786 +#endif + +#ifdef AOM_WORK_AROUND_MSVC_BUG_10369786 +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: - virtual ~FwdTrans8x8TestBase() {} + virtual ~FwdTrans8x8TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -169,7 +183,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff255; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j @@ -200,7 +214,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff15; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j @@ -257,7 +271,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -274,11 +288,11 @@ class FwdTrans8x8TestBase { } } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" << " roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " << "error > 1/5 per block"; } @@ -340,7 +354,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -359,17 +373,17 @@ class FwdTrans8x8TestBase { total_coeff_error += abs(coeff_diff); } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" - << "an individual roundtrip error > 1"; + << " an individual roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; - EXPECT_EQ(0, total_coeff_error) + ASSERT_EQ(0, total_coeff_error) << "Error: Extremal 8x8 FDCT/FHT has" - << "overflow issues in the intermediate steps > 1"; + << " overflow issues in the intermediate steps > 1"; } } @@ -413,7 +427,7 @@ class FwdTrans8x8TestBase { #if CONFIG_VP9_HIGHBITDEPTH } else { ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -425,7 +439,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; - EXPECT_GE(1u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -455,7 +469,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < kNumCoeffs; ++j) { const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; - EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error << " at index " << j; } } @@ -497,9 +511,9 @@ class FwdTrans8x8TestBase { ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); #if CONFIG_VP9_HIGHBITDEPTH } else { - ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_); + ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_); ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); #endif } @@ -511,8 +525,8 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error - << " at index " << j; + ASSERT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; } } } @@ -522,13 +536,16 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#ifdef AOM_WORK_AROUND_MSVC_BUG_10369786 +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~FwdTrans8x8DCT() {} + ~FwdTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -538,13 +555,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -565,9 +582,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~FwdTrans8x8HT() {} + ~FwdTrans8x8HT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -577,13 +594,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -597,12 +614,13 @@ TEST_P(FwdTrans8x8HT, RoundTripErrorCheck) { RunRoundTripErrorCheck(); } TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } +#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam { public: - virtual ~InvTrans8x8DCT() {} + ~InvTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -611,41 +629,44 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } - void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {} + void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, + int /*stride*/) override {} IdctFunc ref_txfm_; IdctFunc inv_txfm_; int thresh_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans8x8DCT); TEST_P(InvTrans8x8DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } +#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8DCT, ::testing::Values( make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8), make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10), make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12))); #else -INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, - &vpx_idct8x8_64_add_c, 0, - VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(C, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -661,7 +682,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -671,32 +692,28 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_c, - &vpx_idct8x8_64_add_neon, - 0, VPX_BITS_8))); -#else // !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_neon, - &vpx_idct8x8_64_add_neon, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(NEON, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_neon, + &vpx_idct8x8_64_add_neon, + 0, VPX_BITS_8))); + +#if !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( NEON, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, - &vpx_idct8x8_64_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(SSE2, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8), @@ -706,7 +723,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_c, 0, VPX_BITS_8), @@ -719,7 +736,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_64_add_12_sse2, 12, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8), @@ -729,7 +746,7 @@ INSTANTIATE_TEST_CASE_P( // Optimizations take effect at a threshold of 6201, so we use a value close to // that to test both branches. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, InvTrans8x8DCT, ::testing::Values( make_tuple(&idct8x8_12_add_10_c, &idct8x8_12_add_10_sse2, 6225, @@ -740,20 +757,20 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, VPX_BITS_12))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ +#if HAVE_SSSE3 && VPX_ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH && \ !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_ssse3, - 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P(SSSE3, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, + &vpx_idct8x8_64_add_sse2, + 0, VPX_BITS_8))); #endif #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, FwdTrans8x8DCT, - ::testing::Values(make_tuple(&vpx_fdct8x8_msa, - &vpx_idct8x8_64_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P(MSA, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_msa, + &vpx_idct8x8_64_add_msa, + 0, VPX_BITS_8))); +INSTANTIATE_TEST_SUITE_P( MSA, FwdTrans8x8HT, ::testing::Values( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 0, VPX_BITS_8), @@ -761,4 +778,18 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_lsx, + &vpx_idct8x8_64_add_c, 0, + VPX_BITS_8))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/media/libvpx/libvpx/test/frame_size_tests.cc b/media/libvpx/libvpx/test/frame_size_tests.cc index 5a9b166e5b..a86ca9a42a 100644 --- a/media/libvpx/libvpx/test/frame_size_tests.cc +++ b/media/libvpx/libvpx/test/frame_size_tests.cc @@ -7,34 +7,95 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "test/codec_factory.h" +#include "test/register_state_check.h" #include "test/video_source.h" +#include "vpx_config.h" namespace { +class EncoderWithExpectedError : public ::libvpx_test::Encoder { + public: + EncoderWithExpectedError(vpx_codec_enc_cfg_t cfg, vpx_enc_deadline_t deadline, + const unsigned long init_flags, // NOLINT + ::libvpx_test::TwopassStatsStore *stats) + : ::libvpx_test::Encoder(cfg, deadline, init_flags, stats) {} + // This overrides with expected error code. + void EncodeFrame(::libvpx_test::VideoSource *video, + const unsigned long frame_flags, // NOLINT + const vpx_codec_err_t expected_err) { + if (video->img()) { + EncodeFrameInternal(*video, frame_flags, expected_err); + } else { + Flush(); + } + + // Handle twopass stats + ::libvpx_test::CxDataIterator iter = GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != VPX_CODEC_STATS_PKT) continue; + + stats_->Append(*pkt); + } + } + + protected: + void EncodeFrameInternal(const ::libvpx_test::VideoSource &video, + const unsigned long frame_flags, // NOLINT + const vpx_codec_err_t expected_err) { + vpx_codec_err_t res; + const vpx_image_t *img = video.img(); + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = vpx_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(res, VPX_CODEC_OK) << EncoderError(); + } + + // Encode the frame + API_REGISTER_STATE_CHECK(res = vpx_codec_encode(&encoder_, img, video.pts(), + video.duration(), + frame_flags, deadline_)); + ASSERT_EQ(expected_err, res) << EncoderError(); + } + + vpx_codec_iface_t *CodecInterface() const override { +#if CONFIG_VP9_ENCODER + return &vpx_codec_vp9_cx_algo; +#else + return nullptr; +#endif + } +}; + class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - virtual ~VP9FrameSizeTestsLarge() {} + ~VP9FrameSizeTestsLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); @@ -43,28 +104,98 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, } } - int expected_res_; + using ::libvpx_test::EncoderTest::RunLoop; + virtual void RunLoop(::libvpx_test::VideoSource *video, + const vpx_codec_err_t expected_err) { + stats_.Reset(); + + ASSERT_TRUE(passes_ == 1 || passes_ == 2); + for (unsigned int pass = 0; pass < passes_; pass++) { + vpx_codec_pts_t last_pts = 0; + + if (passes_ == 1) { + cfg_.g_pass = VPX_RC_ONE_PASS; + } else if (pass == 0) { + cfg_.g_pass = VPX_RC_FIRST_PASS; + } else { + cfg_.g_pass = VPX_RC_LAST_PASS; + } + + BeginPassHook(pass); + std::unique_ptr encoder( + new EncoderWithExpectedError(cfg_, deadline_, init_flags_, &stats_)); + ASSERT_NE(encoder.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(video->Begin()); + encoder->InitEncoder(video); + ASSERT_FALSE(::testing::Test::HasFatalFailure()); + for (bool again = true; again; video->Next()) { + again = (video->img() != nullptr); + + PreEncodeFrameHook(video, encoder.get()); + encoder->EncodeFrame(video, frame_flags_, expected_err); + + PostEncodeFrameHook(encoder.get()); + + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + pkt = MutateEncoderOutputHook(pkt); + again = true; + switch (pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: + ASSERT_GE(pkt->data.frame.pts, last_pts); + last_pts = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case VPX_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + case VPX_CODEC_STATS_PKT: StatsPktHook(pkt); break; + default: break; + } + } + + if (!Continue()) break; + } + + EndPassHook(); + + if (!Continue()) break; + } + } + + vpx_codec_err_t expected_res_; }; TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) { +#ifdef CHROMIUM + GTEST_SKIP() << "16K framebuffers are not supported by Chromium's allocator."; +#else ::libvpx_test::RandomVideoSource video; #if CONFIG_SIZE_LIMIT video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16); video.set_limit(2); - expected_res_ = VPX_CODEC_CORRUPT_FRAME; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + expected_res_ = VPX_CODEC_MEM_ERROR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video, expected_res_)); +#endif + #endif } TEST_F(VP9FrameSizeTestsLarge, ValidSizes) { +#ifdef CHROMIUM + GTEST_SKIP() + << "Under Chromium's configuration the allocator is unable to provide" + "the space required for a single frame at the maximum resolution."; +#else ::libvpx_test::RandomVideoSource video; #if CONFIG_SIZE_LIMIT video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); video.set_limit(2); expected_res_ = VPX_CODEC_OK; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); #else // This test produces a pretty large single frame allocation, (roughly // 25 megabits). The encoder allocates a good number of these frames @@ -73,15 +204,17 @@ TEST_F(VP9FrameSizeTestsLarge, ValidSizes) { // size or almost 1 gig of memory. // In total the allocations will exceed 2GiB which may cause a failure with // mingw + wine, use a smaller size in that case. -#if defined(_WIN32) && !defined(_WIN64) || defined(__OS2__) +#if defined(_WIN32) && !defined(_WIN64) video.SetSize(4096, 3072); #else video.SetSize(4096, 4096); #endif video.set_limit(2); expected_res_ = VPX_CODEC_OK; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); #endif + +#endif // defined(CHROMIUM) } TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) { @@ -90,6 +223,6 @@ TEST_F(VP9FrameSizeTestsLarge, OneByOneVideo) { video.SetSize(1, 1); video.set_limit(2); expected_res_ = VPX_CODEC_OK; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_NO_FATAL_FAILURE(::libvpx_test::EncoderTest::RunLoop(&video)); } } // namespace diff --git a/media/libvpx/libvpx/test/hadamard_test.cc b/media/libvpx/libvpx/test/hadamard_test.cc index e7715958ed..e0cc8f30c2 100644 --- a/media/libvpx/libvpx/test/hadamard_test.cc +++ b/media/libvpx/libvpx/test/hadamard_test.cc @@ -10,26 +10,29 @@ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_ports/vpx_timer.h" #include "test/acm_random.h" #include "test/register_state_check.h" +#include "vpx_config.h" namespace { using ::libvpx_test::ACMRandom; -typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b); +using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); -void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { - int16_t b[8]; +void hadamard_loop(const tran_low_t *a, tran_low_t *out) { + tran_low_t b[8]; for (int i = 0; i < 8; i += 2) { - b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride]; - b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride]; + b[i + 0] = a[i * 8] + a[(i + 1) * 8]; + b[i + 1] = a[i * 8] - a[(i + 1) * 8]; } - int16_t c[8]; + tran_low_t c[8]; for (int i = 0; i < 8; i += 4) { c[i + 0] = b[i + 0] + b[i + 2]; c[i + 1] = b[i + 1] + b[i + 3]; @@ -46,18 +49,19 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { out[5] = c[3] - c[7]; } -void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) { - int16_t buf[64]; +void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) { + tran_low_t input[64]; + tran_low_t buf[64]; for (int i = 0; i < 8; ++i) { - hadamard_loop(a + i, a_stride, buf + i * 8); - } - - for (int i = 0; i < 8; ++i) { - hadamard_loop(buf + i, 8, b + i * 8); + for (int j = 0; j < 8; ++j) { + input[i * 8 + j] = static_cast(a[i * a_stride + j]); + } } + for (int i = 0; i < 8; ++i) hadamard_loop(input + i, buf + i * 8); + for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, b + i * 8); } -void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { +void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { /* The source is a 16x16 block. The destination is rearranged to 8x32. * Input is 9 bit. */ reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0); @@ -68,16 +72,16 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { /* Overlay the 8x8 blocks and combine. */ for (int i = 0; i < 64; ++i) { /* 8x8 steps the range up to 15 bits. */ - const int16_t a0 = b[0]; - const int16_t a1 = b[64]; - const int16_t a2 = b[128]; - const int16_t a3 = b[192]; + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[64]; + const tran_low_t a2 = b[128]; + const tran_low_t a3 = b[192]; /* Prevent the result from escaping int16_t. */ - const int16_t b0 = (a0 + a1) >> 1; - const int16_t b1 = (a0 - a1) >> 1; - const int16_t b2 = (a2 + a3) >> 1; - const int16_t b3 = (a2 - a3) >> 1; + const tran_low_t b0 = (a0 + a1) >> 1; + const tran_low_t b1 = (a0 - a1) >> 1; + const tran_low_t b2 = (a2 + a3) >> 1; + const tran_low_t b3 = (a2 - a3) >> 1; /* Store a 16 bit value. */ b[0] = b0 + b2; @@ -89,132 +93,289 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) { } } -class HadamardTestBase : public ::testing::TestWithParam { +void reference_hadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) { + reference_hadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0); + reference_hadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256); + reference_hadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512); + reference_hadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768); + + for (int i = 0; i < 256; ++i) { + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[256]; + const tran_low_t a2 = b[512]; + const tran_low_t a3 = b[768]; + + const tran_low_t b0 = (a0 + a1) >> 2; + const tran_low_t b1 = (a0 - a1) >> 2; + const tran_low_t b2 = (a2 + a3) >> 2; + const tran_low_t b3 = (a2 - a3) >> 2; + + b[0] = b0 + b2; + b[256] = b1 + b3; + b[512] = b0 - b2; + b[768] = b1 - b3; + + ++b; + } +} + +struct HadamardFuncWithSize { + HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {} + HadamardFunc func; + int block_size; +}; + +std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { + return os << "block size: " << hfs.block_size; +} + +class HadamardTestBase : public ::testing::TestWithParam { public: - virtual void SetUp() { - h_func_ = GetParam(); + void SetUp() override { + h_func_ = GetParam().func; + bwh_ = GetParam().block_size; + block_size_ = bwh_ * bwh_; rnd_.Reset(ACMRandom::DeterministicSeed()); } + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). + virtual int16_t Rand() = 0; + + void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, + int bwh) { + if (bwh == 32) + reference_hadamard32x32(a, a_stride, b); + else if (bwh == 16) + reference_hadamard16x16(a, a_stride, b); + else + reference_hadamard8x8(a, a_stride, b); + } + + void CompareReferenceRandom() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < block_size_; ++i) a[i] = Rand(); + + ReferenceHadamard(a, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + + void VaryStride() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); + for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand(); + + tran_low_t b_ref[kMaxBlockSize]; + for (int i = 8; i < 64; i += 8) { + memset(b, 0, sizeof(b)); + memset(b_ref, 0, sizeof(b_ref)); + + ReferenceHadamard(a, i, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); + + // The order of the output is not important. Sort before checking. + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + + void SpeedTest(int times) { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]); + memset(input, 1, sizeof(input)); + memset(output, 0, sizeof(output)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < times; ++i) { + h_func_(input, bwh_, output); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times, + elapsed_time); + } + protected: + int bwh_; + int block_size_; HadamardFunc h_func_; ACMRandom rnd_; }; -class Hadamard8x8Test : public HadamardTestBase {}; - -TEST_P(Hadamard8x8Test, CompareReferenceRandom) { - DECLARE_ALIGNED(16, int16_t, a[64]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; - for (int i = 0; i < 64; ++i) { - a[i] = rnd_.Rand9Signed(); +class HadamardLowbdTest : public HadamardTestBase { + protected: + // Use values between -255 (0xFF01) and 255 (0x00FF) + int16_t Rand() override { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; } - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); +}; - reference_hadamard8x8(a, 8, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b)); +TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } - // The order of the output is not important. Sort before checking. - std::sort(b, b + 64); - std::sort(b_ref, b_ref + 64); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + +TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } + +TEST_P(HadamardLowbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); } -TEST_P(Hadamard8x8Test, VaryStride) { - DECLARE_ALIGNED(16, int16_t, a[64 * 8]); - DECLARE_ALIGNED(16, int16_t, b[64]); - int16_t b_ref[64]; - for (int i = 0; i < 64 * 8; ++i) { - a[i] = rnd_.Rand9Signed(); - } - - for (int i = 8; i < 64; i += 8) { - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); - - reference_hadamard8x8(a, i, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); - - // The order of the output is not important. Sort before checking. - std::sort(b, b + 64); - std::sort(b_ref, b_ref + 64); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); - } -} - -INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_c)); +INSTANTIATE_TEST_SUITE_P( + C, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_sse2)); +INSTANTIATE_TEST_SUITE_P( + SSE2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_sse2, 32))); #endif // HAVE_SSE2 -#if HAVE_SSSE3 && ARCH_X86_64 -INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_ssse3)); -#endif // HAVE_SSSE3 && ARCH_X86_64 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32))); +#endif // HAVE_AVX2 + +#if HAVE_SSSE3 && VPX_ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + SSSE3, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8))); +#endif // HAVE_SSSE3 && VPX_ARCH_X86_64 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_neon)); +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32))); #endif // HAVE_NEON -class Hadamard16x16Test : public HadamardTestBase {}; +// TODO(jingning): Remove highbitdepth flag when the SIMD functions are +// in place and turn on the unit test. +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16))); +#endif // HAVE_MSA +#endif // !CONFIG_VP9_HIGHBITDEPTH -TEST_P(Hadamard16x16Test, CompareReferenceRandom) { - DECLARE_ALIGNED(16, int16_t, a[16 * 16]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; - for (int i = 0; i < 16 * 16; ++i) { - a[i] = rnd_.Rand9Signed(); +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16))); +#endif // HAVE_LSX + +#if CONFIG_VP9_HIGHBITDEPTH +class HadamardHighbdTest : public HadamardTestBase { + protected: + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + int16_t Rand() override { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; } - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); +}; - reference_hadamard16x16(a, 16, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b)); +TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } - // The order of the output is not important. Sort before checking. - std::sort(b, b + 16 * 16); - std::sort(b_ref, b_ref + 16 * 16); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); +TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); } + +TEST_P(HadamardHighbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); } -TEST_P(Hadamard16x16Test, VaryStride) { - DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]); - DECLARE_ALIGNED(16, int16_t, b[16 * 16]); - int16_t b_ref[16 * 16]; - for (int i = 0; i < 16 * 16 * 8; ++i) { - a[i] = rnd_.Rand9Signed(); - } +INSTANTIATE_TEST_SUITE_P( + C, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32))); - for (int i = 8; i < 64; i += 8) { - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); - - reference_hadamard16x16(a, i, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); - - // The order of the output is not important. Sort before checking. - std::sort(b, b + 16 * 16); - std::sort(b_ref, b_ref + 16 * 16); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); - } -} - -INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_c)); - -#if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_sse2)); -#endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2, + 32))); +#endif // HAVE_AVX2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_neon)); -#endif // HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); +#endif + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/media/libvpx/libvpx/test/i420_video_source.h b/media/libvpx/libvpx/test/i420_video_source.h index 49573823b4..97473b5c2f 100644 --- a/media/libvpx/libvpx/test/i420_video_source.h +++ b/media/libvpx/libvpx/test/i420_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_I420_VIDEO_SOURCE_H_ -#define TEST_I420_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_I420_VIDEO_SOURCE_H_ +#define VPX_TEST_I420_VIDEO_SOURCE_H_ #include #include #include @@ -30,4 +30,4 @@ class I420VideoSource : public YUVVideoSource { } // namespace libvpx_test -#endif // TEST_I420_VIDEO_SOURCE_H_ +#endif // VPX_TEST_I420_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/idct8x8_test.cc b/media/libvpx/libvpx/test/idct8x8_test.cc index 7951bb93c9..28ab257e6e 100644 --- a/media/libvpx/libvpx/test/idct8x8_test.cc +++ b/media/libvpx/libvpx/test/idct8x8_test.cc @@ -12,12 +12,11 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" -#include "vpx_ports/msvc.h" // for round() using libvpx_test::ACMRandom; diff --git a/media/libvpx/libvpx/test/idct_test.cc b/media/libvpx/libvpx/test/idct_test.cc index 700da77e3e..ed969945f8 100644 --- a/media/libvpx/libvpx/test/idct_test.cc +++ b/media/libvpx/libvpx/test/idct_test.cc @@ -11,120 +11,170 @@ #include "./vpx_config.h" #include "./vp8_rtcd.h" -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "vpx/vpx_integer.h" -typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); +using IdctFunc = void (*)(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); namespace { + +using libvpx_test::Buffer; + class IDCTTest : public ::testing::TestWithParam { protected: - virtual void SetUp() { - int i; - + void SetUp() override { UUT = GetParam(); - memset(input, 0, sizeof(input)); - /* Set up guard blocks */ - for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1; + + input = new Buffer(4, 4, 0); + ASSERT_NE(input, nullptr); + ASSERT_TRUE(input->Init()); + predict = new Buffer(4, 4, 3); + ASSERT_NE(predict, nullptr); + ASSERT_TRUE(predict->Init()); + output = new Buffer(4, 4, 3); + ASSERT_NE(output, nullptr); + ASSERT_TRUE(output->Init()); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { + delete input; + delete predict; + delete output; + libvpx_test::ClearSystemState(); + } IdctFunc UUT; - int16_t input[16]; - unsigned char output[256]; - unsigned char predict[256]; + Buffer *input; + Buffer *predict; + Buffer *output; }; -TEST_P(IDCTTest, TestGuardBlocks) { - int i; - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << i; - else - EXPECT_EQ(255, output[i]); - } -} - TEST_P(IDCTTest, TestAllZeros) { - int i; + // When the input is '0' the output will be '0'. + input->Set(0); + predict->Set(0); + output->Set(0); - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(input->CheckValues(0)); + ASSERT_TRUE(input->CheckPadding()); + ASSERT_TRUE(output->CheckValues(0)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAllOnes) { - int i; + input->Set(0); + ASSERT_NE(input->TopLeftPixel(), nullptr); + // When the first element is '4' it will fill the output buffer with '1'. + input->TopLeftPixel()[0] = 4; + predict->Set(0); + output->Set(0); - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; - } + ASSERT_TRUE(output->CheckValues(1)); + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestAddOne) { - int i; + // Set the transform output to '1' and make sure it gets added to the + // prediction buffer. + input->Set(0); + ASSERT_NE(input->TopLeftPixel(), nullptr); + input->TopLeftPixel()[0] = 4; + output->Set(0); - for (i = 0; i < 256; i++) predict[i] = i; - input[0] = 4; - ASM_REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(i + 1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + uint8_t *pred = predict->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + pred[y * predict->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t const *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + EXPECT_EQ(1 + y * 4 + x, out[y * output->stride() + x]); + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } TEST_P(IDCTTest, TestWithData) { - int i; + // Test a single known input. + predict->Set(0); - for (i = 0; i < 16; i++) input[i] = i; - - ASM_REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - - for (i = 0; i < 256; i++) { - if ((i & 0xF) > 3 || i > 63) - EXPECT_EQ(255, output[i]) << "i==" << i; - else if (i == 0) - EXPECT_EQ(11, output[i]) << "i==" << i; - else if (i == 34) - EXPECT_EQ(1, output[i]) << "i==" << i; - else if (i == 2 || i == 17 || i == 32) - EXPECT_EQ(3, output[i]) << "i==" << i; - else - EXPECT_EQ(0, output[i]) << "i==" << i; + int16_t *in = input->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + in[y * input->stride() + x] = y * 4 + x; + } } + + ASM_REGISTER_STATE_CHECK(UUT(input->TopLeftPixel(), predict->TopLeftPixel(), + predict->stride(), output->TopLeftPixel(), + output->stride())); + + uint8_t *out = output->TopLeftPixel(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + switch (y * 4 + x) { + case 0: EXPECT_EQ(11, out[y * output->stride() + x]); break; + case 2: + case 5: + case 8: EXPECT_EQ(3, out[y * output->stride() + x]); break; + case 10: EXPECT_EQ(1, out[y * output->stride() + x]); break; + default: EXPECT_EQ(0, out[y * output->stride() + x]); + } + } + } + + if (HasFailure()) { + output->DumpBuffer(); + } + + ASSERT_TRUE(output->CheckPadding()); } -INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); +INSTANTIATE_TEST_SUITE_P(C, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_c)); + #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_neon)); -#endif +INSTANTIATE_TEST_SUITE_P(NEON, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_neon)); +#endif // HAVE_NEON + #if HAVE_MMX -INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_mmx)); -#endif +INSTANTIATE_TEST_SUITE_P(MMX, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmx)); +#endif // HAVE_MMX + #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_msa)); -#endif -} +INSTANTIATE_TEST_SUITE_P(MSA, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_msa)); +#endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI +} // namespace diff --git a/media/libvpx/libvpx/test/init_vpx_test.cc b/media/libvpx/libvpx/test/init_vpx_test.cc new file mode 100644 index 0000000000..11e3863c6d --- /dev/null +++ b/media/libvpx/libvpx/test/init_vpx_test.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "test/init_vpx_test.h" + +#include "./vpx_config.h" + +#if !CONFIG_SHARED +#include +#include "gtest/gtest.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. + if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED + +namespace libvpx_test { +void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } + if (!(caps & HAS_SVE)) { + append_negative_gtest_filter(":SVE.*:SVE/*"); + } + if (!(caps & HAS_SVE2)) { + append_negative_gtest_filter(":SVE2.*:SVE2/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + + // Shared library builds don't support whitebox tests that exercise internal + // symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/init_vpx_test.h b/media/libvpx/libvpx/test/init_vpx_test.h new file mode 100644 index 0000000000..5e0dbb0e7e --- /dev/null +++ b/media/libvpx/libvpx/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif // TEST_INIT_VPX_TEST_H_ diff --git a/media/libvpx/libvpx/test/invalid_file_test.cc b/media/libvpx/libvpx/test/invalid_file_test.cc index eae81faa13..0b895ed902 100644 --- a/media/libvpx/libvpx/test/invalid_file_test.cc +++ b/media/libvpx/libvpx/test/invalid_file_test.cc @@ -10,9 +10,10 @@ #include #include +#include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" @@ -37,23 +38,22 @@ std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) { class InvalidFileTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: - InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {} + InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} - virtual ~InvalidFileTest() { - if (res_file_ != NULL) fclose(res_file_); + ~InvalidFileTest() override { + if (res_file_ != nullptr) fclose(res_file_); } void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: " - << res_file_name_; + ASSERT_NE(res_file_, nullptr) + << "Result file open failed. Filename: " << res_file_name_; } - virtual bool HandleDecodeResult( - const vpx_codec_err_t res_dec, - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { - EXPECT_TRUE(res_file_ != NULL); + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + EXPECT_NE(res_file_, nullptr); int expected_res_dec; // Read integer result. @@ -89,7 +89,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, const std::string filename = input.filename; // Open compressed video file. - testing::internal::scoped_ptr video; + std::unique_ptr video; if (filename.substr(filename.length() - 3, 3) == "ivf") { video.reset(new libvpx_test::IVFVideoSource(filename)); } else if (filename.substr(filename.length() - 4, 4) == "webm") { @@ -101,7 +101,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct result file name. The file holds a list of expected integer @@ -120,11 +120,26 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, + { 1, "invalid-bug-148271109.ivf" }, + { 1, "invalid-token-partition.ivf" }, + { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, +}; + +VP8_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + #if CONFIG_VP9_DECODER const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-02-v2.webm" }, #if CONFIG_VP9_HIGHBITDEPTH { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." + "ivf" }, #endif { 1, "invalid-vp90-03-v3.webm" }, { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, @@ -132,7 +147,7 @@ const DecodeParam kVP9InvalidFileTests[] = { // This file will cause a large allocation which is expected to fail in 32-bit // environments. Test x86 for coverage purposes as the allocation failure will // be in platform agnostic code. -#if ARCH_X86 +#if VPX_ARCH_X86 { 1, "invalid-vp90-2-00-quantizer-63.ivf.kf_65527x61446.ivf" }, #endif { 1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf" }, @@ -147,8 +162,8 @@ const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-crbug-667044.webm" }, }; -VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, - ::testing::ValuesIn(kVP9InvalidFileTests)); +VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, + ::testing::ValuesIn(kVP9InvalidFileTests)); #endif // CONFIG_VP9_DECODER // This class will include test vectors that are expected to fail @@ -156,20 +171,20 @@ VP9_INSTANTIATE_TEST_CASE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, - libvpx_test::CompressedVideoSource * /*video*/, - const vpx_codec_err_t /*res_peek*/) {} + void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource * /*video*/, + const vpx_codec_err_t /*res_peek*/) override {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER -const DecodeParam kVP8InvalidFileTests[] = { +const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; -VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); +VP8_INSTANTIATE_TEST_SUITE(InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER @@ -177,8 +192,9 @@ const DecodeParam kVP9InvalidFileInvalidPeekTests[] = { { 1, "invalid-vp90-01-v3.webm" }, }; -VP9_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests)); +VP9_INSTANTIATE_TEST_SUITE( + InvalidFileInvalidPeekTest, + ::testing::ValuesIn(kVP9InvalidFileInvalidPeekTests)); const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm" }, @@ -190,9 +206,11 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, { 2, "invalid-crbug-629481.webm" }, + { 3, "invalid-crbug-1558.ivf" }, + { 4, "invalid-crbug-1562.ivf" }, }; -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9MultiThreaded, InvalidFileTest, ::testing::Combine( ::testing::Values( diff --git a/media/libvpx/libvpx/test/ivf_video_source.h b/media/libvpx/libvpx/test/ivf_video_source.h index b87624a11f..3ccac62b51 100644 --- a/media/libvpx/libvpx/test/ivf_video_source.h +++ b/media/libvpx/libvpx/test/ivf_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_IVF_VIDEO_SOURCE_H_ -#define TEST_IVF_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_IVF_VIDEO_SOURCE_H_ +#define VPX_TEST_IVF_VIDEO_SOURCE_H_ #include #include #include @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; @@ -29,26 +29,26 @@ static unsigned int MemGetLe32(const uint8_t *mem) { class IVFVideoSource : public CompressedVideoSource { public: explicit IVFVideoSource(const std::string &file_name) - : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), - frame_sz_(0), frame_(0), end_of_file_(false) {} + : file_name_(file_name), input_file_(nullptr), + compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), + end_of_file_(false) {} - virtual ~IVFVideoSource() { + ~IVFVideoSource() override { delete[] compressed_frame_buf_; if (input_file_) fclose(input_file_); } - virtual void Init() { + void Init() override { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_ != NULL) - << "Allocate frame buffer failed"; + ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } - virtual void Begin() { + void Begin() override { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; // Read file header uint8_t file_hdr[kIvfFileHdrSize]; @@ -62,13 +62,13 @@ class IVFVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) != @@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource { } } - virtual const uint8_t *cxdata() const { - return end_of_file_ ? NULL : compressed_frame_buf_; + const uint8_t *cxdata() const override { + return end_of_file_ ? nullptr : compressed_frame_buf_; } - virtual size_t frame_size() const { return frame_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; @@ -103,4 +103,4 @@ class IVFVideoSource : public CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_IVF_VIDEO_SOURCE_H_ +#endif // VPX_TEST_IVF_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/keyframe_test.cc b/media/libvpx/libvpx/test/keyframe_test.cc index 38bd923b7d..c49ea91bd2 100644 --- a/media/libvpx/libvpx/test/keyframe_test.cc +++ b/media/libvpx/libvpx/test/keyframe_test.cc @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { @@ -22,9 +28,9 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~KeyframeTest() {} + ~KeyframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); kf_count_ = 0; @@ -33,17 +39,17 @@ class KeyframeTest set_cpu_used_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (kf_do_force_kf_) { frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF; } - if (set_cpu_used_ && video->frame() == 1) { + if (set_cpu_used_ && video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { kf_pts_list_.push_back(pkt->data.frame.pts); kf_count_++; @@ -68,7 +74,9 @@ TEST_P(KeyframeTest, TestRandomVideoSource) { // In realtime mode - auto placed keyframes are exceedingly rare, don't // bother with this check if(GetParam() > 0) - if (GET_PARAM(1) > 0) EXPECT_GT(kf_count_, 1); + if (GET_PARAM(1) > 0) { + EXPECT_GT(kf_count_, 1); + } } TEST_P(KeyframeTest, TestDisableKeyframes) { @@ -128,19 +136,121 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { // In realtime mode - auto placed keyframes are exceedingly rare, don't // bother with this check - if (GET_PARAM(1) > 0) + if (GET_PARAM(1) > 0) { EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes "; + } // Verify that keyframes match the file keyframes in the file. for (std::vector::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { if (deadline_ == VPX_DL_REALTIME && *iter > 0) - EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " - << *iter; + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; else EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; } } -VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES); +VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + vpx_enc_deadline_t deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (vpx_enc_deadline_t deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + } // namespace diff --git a/media/libvpx/libvpx/test/level_test.cc b/media/libvpx/libvpx/test/level_test.cc index 67a794e6f8..03217a293e 100644 --- a/media/libvpx/libvpx/test/level_test.cc +++ b/media/libvpx/libvpx/test/level_test.cc @@ -7,11 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "vpx_config.h" namespace { class LevelTest @@ -22,9 +23,9 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - virtual ~LevelTest() {} + ~LevelTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -41,8 +42,8 @@ class LevelTest cfg_.rc_min_quantizer = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_); @@ -66,34 +67,46 @@ class LevelTest int level_; }; -TEST_P(LevelTest, TestTargetLevel11) { +TEST_P(LevelTest, TestTargetLevel11Large) { +#if CONFIG_REALTIME_ONLY + GTEST_SKIP(); +#else ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, - 90); + 60); target_level_ = 11; cfg_.rc_target_bitrate = 150; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); +#endif } -TEST_P(LevelTest, TestTargetLevel20) { +TEST_P(LevelTest, TestTargetLevel20Large) { +#if CONFIG_REALTIME_ONLY + GTEST_SKIP(); +#else ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 90); + 30, 1, 0, 60); target_level_ = 20; cfg_.rc_target_bitrate = 1200; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); +#endif } -TEST_P(LevelTest, TestTargetLevel31) { +TEST_P(LevelTest, TestTargetLevel31Large) { +#if CONFIG_REALTIME_ONLY + GTEST_SKIP(); +#else ASSERT_NE(encoding_mode_, ::libvpx_test::kRealTime); ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720, 30, 1, 0, 60); target_level_ = 31; cfg_.rc_target_bitrate = 8000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); +#endif } // Test for keeping level stats only @@ -103,11 +116,11 @@ TEST_P(LevelTest, TestTargetLevel0) { target_level_ = 0; min_gf_internal_ = 4; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(11, level_); + ASSERT_GE(11, level_); cfg_.rc_target_bitrate = 1600; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(20, level_); + ASSERT_GE(20, level_); } // Test for level control being turned off @@ -120,7 +133,7 @@ TEST_P(LevelTest, TestTargetLevel255) { TEST_P(LevelTest, TestTargetLevelApi) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); - static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); @@ -130,7 +143,7 @@ TEST_P(LevelTest, TestTargetLevelApi) { if (level == 10 || level == 11 || level == 20 || level == 21 || level == 30 || level == 31 || level == 40 || level == 41 || level == 50 || level == 51 || level == 52 || level == 60 || - level == 61 || level == 62 || level == 0 || level == 255) + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) EXPECT_EQ(VPX_CODEC_OK, vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); else @@ -140,8 +153,6 @@ TEST_P(LevelTest, TestTargetLevelApi) { EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc)); } -VP9_INSTANTIATE_TEST_CASE(LevelTest, - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood), - ::testing::Range(0, 9)); +VP9_INSTANTIATE_TEST_SUITE(LevelTest, ONE_OR_TWO_PASS_TEST_MODES, + ::testing::Range(0, 9)); } // namespace diff --git a/media/libvpx/libvpx/test/lpf_test.cc b/media/libvpx/libvpx/test/lpf_test.cc index 4fca7d49c0..b1688f7d26 100644 --- a/media/libvpx/libvpx/test/lpf_test.cc +++ b/media/libvpx/libvpx/test/lpf_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -35,29 +36,29 @@ const int kNumCoeffs = 1024; const int number_of_iterations = 10000; #if CONFIG_VP9_HIGHBITDEPTH -typedef uint16_t Pixel; +using Pixel = uint16_t; #define PIXEL_WIDTH 16 -typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, int bd); -typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd); +using loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, int bd); +using dual_loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd); #else -typedef uint8_t Pixel; +using Pixel = uint8_t; #define PIXEL_WIDTH 8 -typedef void (*loop_op_t)(Pixel *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh); -typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1); +using loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh); +using dual_loop_op_t = void (*)(Pixel *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1); #endif // CONFIG_VP9_HIGHBITDEPTH -typedef std::tr1::tuple loop8_param_t; -typedef std::tr1::tuple dualloop8_param_t; +using loop8_param_t = std::tuple; +using dualloop8_param_t = std::tuple; void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, const int mask, const int32_t p, const int i) { @@ -74,9 +75,9 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, if (j < 1) { tmp_s[j] = rnd->Rand16(); } else if (val & 0x20) { // Increment by a value within the limit. - tmp_s[j] = tmp_s[j - 1] + (limit - 1); + tmp_s[j] = static_cast(tmp_s[j - 1] + (limit - 1)); } else { // Decrement by a value within the limit. - tmp_s[j] = tmp_s[j - 1] - (limit - 1); + tmp_s[j] = static_cast(tmp_s[j - 1] - (limit - 1)); } j++; } @@ -93,11 +94,11 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, if (j < 1) { tmp_s[j] = rnd->Rand16(); } else if (val & 0x20) { // Increment by a value within the limit. - tmp_s[(j % 32) * 32 + j / 32] = - tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1); + tmp_s[(j % 32) * 32 + j / 32] = static_cast( + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1)); } else { // Decrement by a value within the limit. - tmp_s[(j % 32) * 32 + j / 32] = - tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1); + tmp_s[(j % 32) * 32 + j / 32] = static_cast( + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1)); } j++; } @@ -114,17 +115,29 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, } } +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(3 * MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + class Loop8Test6Param : public ::testing::TestWithParam { public: - virtual ~Loop8Test6Param() {} - virtual void SetUp() { + ~Loop8Test6Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; @@ -132,18 +145,21 @@ class Loop8Test6Param : public ::testing::TestWithParam { loop_op_t loopfilter_op_; loop_op_t ref_loopfilter_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \ + (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam { public: - virtual ~Loop8Test9Param() {} - virtual void SetUp() { + ~Loop8Test9Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; @@ -151,6 +167,9 @@ class Loop8Test9Param : public ::testing::TestWithParam { dual_loop_op_t loopfilter_op_; dual_loop_op_t ref_loopfilter_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param); +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test6Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -162,15 +181,15 @@ TEST_P(Loop8Test6Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -221,15 +240,15 @@ TEST_P(Loop8Test6Param, ValueCheck) { for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -262,6 +281,8 @@ TEST_P(Loop8Test6Param, ValueCheck) { << "First failed at test case " << first_failure; } +#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \ + (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)) TEST_P(Loop8Test9Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = number_of_iterations; @@ -271,27 +292,27 @@ TEST_P(Loop8Test9Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -334,27 +355,27 @@ TEST_P(Loop8Test9Param, ValueCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -389,12 +410,15 @@ TEST_P(Loop8Test9Param, ValueCheck) { "loopfilter output. " << "First failed at test case " << first_failure; } +#endif // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA && + // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX && + // (!CONFIG_VP9_HIGHBITDEPTH)) -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_sse2, &vpx_highbd_lpf_horizontal_4_c, 8), @@ -445,7 +469,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_16_dual_sse2, &vpx_highbd_lpf_vertical_16_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_sse2, &vpx_lpf_horizontal_4_c, 8), @@ -462,7 +486,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_AVX2 && (!CONFIG_VP9_HIGHBITDEPTH) -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_16_avx2, &vpx_lpf_horizontal_16_c, 8), @@ -472,7 +496,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_sse2, &vpx_highbd_lpf_horizontal_4_dual_c, 8), @@ -499,7 +523,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_8_dual_sse2, &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_sse2, &vpx_lpf_horizontal_4_dual_c, 8), @@ -514,7 +538,7 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_NEON #if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test6Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_neon, &vpx_highbd_lpf_horizontal_4_c, 8), @@ -564,7 +588,7 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_lpf_vertical_16_dual_c, 10), make_tuple(&vpx_highbd_lpf_vertical_16_dual_neon, &vpx_highbd_lpf_vertical_16_dual_c, 12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_highbd_lpf_horizontal_4_dual_neon, &vpx_highbd_lpf_horizontal_4_dual_c, 8), @@ -591,7 +615,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_highbd_lpf_vertical_8_dual_neon, &vpx_highbd_lpf_vertical_8_dual_c, 12))); #else -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_16_neon, &vpx_lpf_horizontal_16_c, 8), @@ -604,7 +628,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_8_neon, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_horizontal_4_neon, &vpx_lpf_horizontal_4_c, 8), make_tuple(&vpx_lpf_vertical_4_neon, &vpx_lpf_vertical_4_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_8_dual_neon, &vpx_lpf_horizontal_8_dual_c, 8), @@ -618,7 +642,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_dspr2, &vpx_lpf_horizontal_4_c, 8), @@ -632,7 +656,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_16_dual_dspr2, &vpx_lpf_vertical_16_dual_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_dspr2, &vpx_lpf_horizontal_4_dual_c, 8), @@ -645,7 +669,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Loop8Test6Param, ::testing::Values( make_tuple(&vpx_lpf_horizontal_4_msa, &vpx_lpf_horizontal_4_c, 8), @@ -657,7 +681,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_lpf_vertical_8_msa, &vpx_lpf_vertical_8_c, 8), make_tuple(&vpx_lpf_vertical_16_msa, &vpx_lpf_vertical_16_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, Loop8Test9Param, ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_msa, &vpx_lpf_horizontal_4_dual_c, 8), @@ -669,4 +693,29 @@ INSTANTIATE_TEST_CASE_P( &vpx_lpf_vertical_8_dual_c, 8))); #endif // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH) +#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test6Param, + ::testing::Values( + make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8), + make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8), + make_tuple(&vpx_lpf_horizontal_16_dual_lsx, + &vpx_lpf_horizontal_16_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8), + make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8), + make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c, + 8))); + +INSTANTIATE_TEST_SUITE_P( + LSX, Loop8Test9Param, + ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx, + &vpx_lpf_horizontal_4_dual_c, 8), + make_tuple(&vpx_lpf_horizontal_8_dual_lsx, + &vpx_lpf_horizontal_8_dual_c, 8), + make_tuple(&vpx_lpf_vertical_4_dual_lsx, + &vpx_lpf_vertical_4_dual_c, 8), + make_tuple(&vpx_lpf_vertical_8_dual_lsx, + &vpx_lpf_vertical_8_dual_c, 8))); +#endif // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH) + } // namespace diff --git a/media/libvpx/libvpx/test/md5_helper.h b/media/libvpx/libvpx/test/md5_helper.h index ef310a2d90..9095d96a8a 100644 --- a/media/libvpx/libvpx/test/md5_helper.h +++ b/media/libvpx/libvpx/test/md5_helper.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_MD5_HELPER_H_ -#define TEST_MD5_HELPER_H_ +#ifndef VPX_TEST_MD5_HELPER_H_ +#define VPX_TEST_MD5_HELPER_H_ #include "./md5_utils.h" #include "vpx/vpx_decoder.h" @@ -47,7 +47,7 @@ class MD5 { MD5Update(&md5_, data, static_cast(size)); } - const char *Get(void) { + const char *Get() { static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', @@ -72,4 +72,4 @@ class MD5 { } // namespace libvpx_test -#endif // TEST_MD5_HELPER_H_ +#endif // VPX_TEST_MD5_HELPER_H_ diff --git a/media/libvpx/libvpx/test/minmax_test.cc b/media/libvpx/libvpx/test/minmax_test.cc index e51c9fd48c..8c758ab45f 100644 --- a/media/libvpx/libvpx/test/minmax_test.cc +++ b/media/libvpx/libvpx/test/minmax_test.cc @@ -11,10 +11,12 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -23,12 +25,12 @@ namespace { using ::libvpx_test::ACMRandom; -typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int *min, int *max); +using MinMaxFunc = void (*)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max); class MinMaxTest : public ::testing::TestWithParam { public: - virtual void SetUp() { + void SetUp() override { mm_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } @@ -107,24 +109,141 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { int min_ref, max_ref, min, max; reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); - EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; - EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; } } } -INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDMinMaxTest = MinMaxTest; + +void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 65535; + int max = 0; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(HBDMinMaxTest, MinValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(65535, max); + EXPECT_EQ(i, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, MaxValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, CompareReference) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc(64 * sizeof(uint16_t)))); + for (int j = 0; j < 64; j++) { + CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16(); + } + + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + for (int i = 0; i < 8 * 64; i++) { + CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} +#endif + +INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_c)); +#endif #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest, - ::testing::Values(&vpx_minmax_8x8_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_sse2)); #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest, - ::testing::Values(&vpx_minmax_8x8_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif +#endif + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, MinMaxTest, + ::testing::Values(&vpx_minmax_8x8_msa)); #endif } // namespace diff --git a/media/libvpx/libvpx/test/non_greedy_mv_test.cc b/media/libvpx/libvpx/test/non_greedy_mv_test.cc new file mode 100644 index 0000000000..6b5dcc651b --- /dev/null +++ b/media/libvpx/libvpx/test/non_greedy_mv_test.cc @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "gtest/gtest.h" +#include "vp9/encoder/vp9_non_greedy_mv.h" +#include "./vpx_dsp_rtcd.h" + +namespace { + +static void read_in_mf(const char *filename, int *rows_ptr, int *cols_ptr, + MV **buffer_ptr) { + FILE *input = fopen(filename, "rb"); + int row, col; + int idx; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", rows_ptr, cols_ptr); + + *buffer_ptr = (MV *)malloc((*rows_ptr) * (*cols_ptr) * sizeof(MV)); + + for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) { + fscanf(input, "%d,%d;", &row, &col); + (*buffer_ptr)[idx].row = row; + (*buffer_ptr)[idx].col = col; + } + fclose(input); +} + +static void read_in_local_var(const char *filename, int *rows_ptr, + int *cols_ptr, + int (**M_ptr)[MF_LOCAL_STRUCTURE_SIZE]) { + FILE *input = fopen(filename, "rb"); + int M00, M01, M10, M11; + int idx; + int int_type; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", rows_ptr, cols_ptr); + + *M_ptr = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc( + (*rows_ptr) * (*cols_ptr) * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type)); + + for (idx = 0; idx < (*rows_ptr) * (*cols_ptr); ++idx) { + fscanf(input, "%d,%d,%d,%d;", &M00, &M01, &M10, &M11); + (*M_ptr)[idx][0] = M00; + (*M_ptr)[idx][1] = M01; + (*M_ptr)[idx][2] = M10; + (*M_ptr)[idx][3] = M11; + } + fclose(input); +} + +static void compare_mf(const MV *mf1, const MV *mf2, int rows, int cols, + float *mean_ptr, float *std_ptr) { + float float_type; + float *diffs = (float *)malloc(rows * cols * sizeof(float_type)); + int idx; + float accu = 0.0f; + for (idx = 0; idx < rows * cols; ++idx) { + MV mv1 = mf1[idx]; + MV mv2 = mf2[idx]; + float row_diff2 = (float)((mv1.row - mv2.row) * (mv1.row - mv2.row)); + float col_diff2 = (float)((mv1.col - mv2.col) * (mv1.col - mv2.col)); + diffs[idx] = sqrt(row_diff2 + col_diff2); + accu += diffs[idx]; + } + *mean_ptr = accu / rows / cols; + *std_ptr = 0; + for (idx = 0; idx < rows * cols; ++idx) { + *std_ptr += (diffs[idx] - (*mean_ptr)) * (diffs[idx] - (*mean_ptr)); + } + *std_ptr = sqrt(*std_ptr / rows / cols); + free(diffs); +} + +static void load_frame_info(const char *filename, + YV12_BUFFER_CONFIG *ref_frame_ptr) { + FILE *input = fopen(filename, "rb"); + int idx; + uint8_t data_type; + + ASSERT_NE(input, nullptr) << "Cannot open file: " << filename << std::endl; + + fscanf(input, "%d,%d\n", &(ref_frame_ptr->y_height), + &(ref_frame_ptr->y_width)); + + ref_frame_ptr->y_buffer = (uint8_t *)malloc( + (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height) * sizeof(data_type)); + + for (idx = 0; idx < (ref_frame_ptr->y_width) * (ref_frame_ptr->y_height); + ++idx) { + int value; + fscanf(input, "%d,", &value); + ref_frame_ptr->y_buffer[idx] = (uint8_t)value; + } + + ref_frame_ptr->y_stride = ref_frame_ptr->y_width; + fclose(input); +} + +static int compare_local_var(const int (*local_var1)[MF_LOCAL_STRUCTURE_SIZE], + const int (*local_var2)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols) { + int diff = 0; + int outter_idx, inner_idx; + for (outter_idx = 0; outter_idx < rows * cols; ++outter_idx) { + for (inner_idx = 0; inner_idx < MF_LOCAL_STRUCTURE_SIZE; ++inner_idx) { + diff += abs(local_var1[outter_idx][inner_idx] - + local_var2[outter_idx][inner_idx]); + } + } + return diff / rows / cols; +} + +TEST(non_greedy_mv, smooth_mf) { + const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt"; + const char *local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt"; + const char *estimation_file = "non_greedy_mv_test_files/estimation_16x16.txt"; + const char *ground_truth_file = + "non_greedy_mv_test_files/ground_truth_16x16.txt"; + BLOCK_SIZE bsize = BLOCK_32X32; + MV *search_mf = nullptr; + MV *smooth_mf = nullptr; + MV *estimation = nullptr; + MV *ground_truth = nullptr; + int(*local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + int rows = 0, cols = 0; + + int alpha = 100, max_iter = 100; + + read_in_mf(search_mf_file, &rows, &cols, &search_mf); + read_in_local_var(local_var_file, &rows, &cols, &local_var); + read_in_mf(estimation_file, &rows, &cols, &estimation); + read_in_mf(ground_truth_file, &rows, &cols, &ground_truth); + + float sm_mean, sm_std; + float est_mean, est_std; + + smooth_mf = (MV *)malloc(rows * cols * sizeof(MV)); + vp9_get_smooth_motion_field(search_mf, local_var, rows, cols, bsize, alpha, + max_iter, smooth_mf); + + compare_mf(smooth_mf, ground_truth, rows, cols, &sm_mean, &sm_std); + compare_mf(smooth_mf, estimation, rows, cols, &est_mean, &est_std); + + EXPECT_LE(sm_mean, 3); + EXPECT_LE(est_mean, 2); + + free(search_mf); + free(local_var); + free(estimation); + free(ground_truth); + free(smooth_mf); +} + +TEST(non_greedy_mv, local_var) { + const char *ref_frame_file = "non_greedy_mv_test_files/ref_frame_16x16.txt"; + const char *cur_frame_file = "non_greedy_mv_test_files/cur_frame_16x16.txt"; + const char *gt_local_var_file = "non_greedy_mv_test_files/localVar_16x16.txt"; + const char *search_mf_file = "non_greedy_mv_test_files/exhaust_16x16.txt"; + BLOCK_SIZE bsize = BLOCK_16X16; + int(*gt_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + int(*est_local_var)[MF_LOCAL_STRUCTURE_SIZE] = nullptr; + YV12_BUFFER_CONFIG ref_frame, cur_frame; + int rows, cols; + MV *search_mf; + int int_type; + int local_var_diff; + vp9_variance_fn_ptr_t fn; + + load_frame_info(ref_frame_file, &ref_frame); + load_frame_info(cur_frame_file, &cur_frame); + read_in_mf(search_mf_file, &rows, &cols, &search_mf); + + fn.sdf = vpx_sad16x16; + est_local_var = (int(*)[MF_LOCAL_STRUCTURE_SIZE])malloc( + rows * cols * MF_LOCAL_STRUCTURE_SIZE * sizeof(int_type)); + vp9_get_local_structure(&cur_frame, &ref_frame, search_mf, &fn, rows, cols, + bsize, est_local_var); + read_in_local_var(gt_local_var_file, &rows, &cols, >_local_var); + + local_var_diff = compare_local_var(est_local_var, gt_local_var, rows, cols); + + EXPECT_LE(local_var_diff, 1); + + free(gt_local_var); + free(est_local_var); + free(ref_frame.y_buffer); +} +} // namespace diff --git a/media/libvpx/libvpx/test/partial_idct_test.cc b/media/libvpx/libvpx/test/partial_idct_test.cc index 7e901bd033..da391b1a7e 100644 --- a/media/libvpx/libvpx/test/partial_idct_test.cc +++ b/media/libvpx/libvpx/test/partial_idct_test.cc @@ -11,10 +11,10 @@ #include #include #include - #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -25,16 +25,17 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_scan.h" #include "vpx/vpx_integer.h" +#include "vpx_config.h" #include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { -typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*InvTxfmWithBdFunc)(const tran_low_t *in, uint8_t *out, - int stride, int bd); +using FwdTxfmFunc = void (*)(const int16_t *in, tran_low_t *out, int stride); +using InvTxfmFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride); +using InvTxfmWithBdFunc = void (*)(const tran_low_t *in, uint8_t *out, + int stride, int bd); template void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { @@ -43,53 +44,29 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { } #if CONFIG_VP9_HIGHBITDEPTH -template +using InvTxfmHighbdFunc = void (*)(const tran_low_t *in, uint16_t *out, + int stride, int bd); + +template void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { - fn(in, CONVERT_TO_BYTEPTR(out), stride, bd); + fn(in, CAST_TO_SHORTPTR(out), stride, bd); } #endif -typedef std::tr1::tuple - PartialInvTxfmParam; +using PartialInvTxfmParam = + std::tuple; const int kMaxNumCoeffs = 1024; const int kCountTestBlock = 1000; -// https://bugs.chromium.org/p/webm/issues/detail?id=1332 -// The functions specified do not pass with INT16_MIN/MAX. They fail at the -// value specified, but pass when 1 is added/subtracted. -int16_t MaxSupportedCoeff(InvTxfmWithBdFunc a) { -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE - if (a == &wrapper || - a == &wrapper) { - return 23625 - 1; - } -#else - (void)a; -#endif - return std::numeric_limits::max(); -} - -int16_t MinSupportedCoeff(InvTxfmWithBdFunc a) { -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE - if (a == &wrapper || - a == &wrapper) { - return -23625 + 1; - } -#else - (void)a; -#endif - return std::numeric_limits::min(); -} - class PartialIDctTest : public ::testing::TestWithParam { public: - virtual ~PartialIDctTest() {} - virtual void SetUp() { + ~PartialIDctTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); - ftxfm_ = GET_PARAM(0); - full_itxfm_ = GET_PARAM(1); - partial_itxfm_ = GET_PARAM(2); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); tx_size_ = GET_PARAM(3); last_nonzero_ = GET_PARAM(4); bit_depth_ = GET_PARAM(5); @@ -101,7 +78,7 @@ class PartialIDctTest : public ::testing::TestWithParam { case TX_8X8: size_ = 8; break; case TX_16X16: size_ = 16; break; case TX_32X32: size_ = 32; break; - default: FAIL() << "Wrong Size!"; break; + default: FAIL() << "Wrong Size!"; } // Randomize stride_ to a value less than or equal to 1024 @@ -125,13 +102,13 @@ class PartialIDctTest : public ::testing::TestWithParam { vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { + void TearDown() override { vpx_free(input_block_); - input_block_ = NULL; + input_block_ = nullptr; vpx_free(output_block_); - output_block_ = NULL; + output_block_ = nullptr; vpx_free(output_block_ref_); - output_block_ref_ = NULL; + output_block_ref_ = nullptr; libvpx_test::ClearSystemState(); } @@ -153,12 +130,12 @@ class PartialIDctTest : public ::testing::TestWithParam { } void InitInput() { - const int max_coeff = 32766 / 4; - int max_energy_leftover = max_coeff * max_coeff; + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; for (int j = 0; j < last_nonzero_; ++j) { - int16_t coeff = static_cast(sqrt(1.0 * max_energy_leftover) * - (rnd_.Rand16() - 32768) / 65536); - max_energy_leftover -= coeff * coeff; + tran_low_t coeff = static_cast( + sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); + max_energy_leftover -= static_cast(coeff) * coeff; if (max_energy_leftover < 0) { max_energy_leftover = 0; coeff = 0; @@ -167,6 +144,36 @@ class PartialIDctTest : public ::testing::TestWithParam { } } + void PrintDiff() { + if (memcmp(output_block_ref_, output_block_, + pixel_size_ * output_block_size_)) { + uint16_t ref, opt; + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + if (pixel_size_ == 1) { + ref = output_block_ref_[y * stride_ + x]; + opt = output_block_[y * stride_ + x]; + } else { + ref = reinterpret_cast( + output_block_ref_)[y * stride_ + x]; + opt = reinterpret_cast(output_block_)[y * stride_ + x]; + } + if (ref != opt) { + printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt); + } + } + } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } + } + } + protected: int last_nonzero_; TX_SIZE tx_size_; @@ -180,34 +187,43 @@ class PartialIDctTest : public ::testing::TestWithParam { int output_block_size_; int bit_depth_; int mask_; - FwdTxfmFunc ftxfm_; - InvTxfmWithBdFunc full_itxfm_; - InvTxfmWithBdFunc partial_itxfm_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; ACMRandom rnd_; }; TEST_P(PartialIDctTest, RunQuantCheck) { + const int count_test_block = (size_ != 4) ? kCountTestBlock : 65536; DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]); DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]); InitMem(); - for (int i = 0; i < kCountTestBlock * kCountTestBlock; ++i) { + + for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-mask_, mask_]. - if (i == 0) { - for (int k = 0; k < input_block_size_; ++k) { - input_extreme_block[k] = mask_; - } - } else if (i == 1) { - for (int k = 0; k < input_block_size_; ++k) { - input_extreme_block[k] = -mask_; + if (size_ != 4) { + if (i == 0) { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = mask_; + } + } else if (i == 1) { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = -mask_; + } + } else { + for (int k = 0; k < input_block_size_; ++k) { + input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_; + } } } else { + // Try all possible combinations. for (int k = 0; k < input_block_size_; ++k) { - input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_; + input_extreme_block[k] = (i & (1 << k)) ? mask_ : -mask_; } } - ftxfm_(input_extreme_block, output_ref_block, size_); + fwd_txfm_(input_extreme_block, output_ref_block, size_); // quantization with minimum allowed step sizes input_block_[0] = (output_ref_block[0] / 4) * 4; @@ -217,9 +233,9 @@ TEST_P(PartialIDctTest, RunQuantCheck) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -232,9 +248,9 @@ TEST_P(PartialIDctTest, ResultsMatch) { InitInput(); ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -249,9 +265,9 @@ TEST_P(PartialIDctTest, AddOutputBlock) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Transform results are not correctly added to output."; @@ -259,8 +275,8 @@ TEST_P(PartialIDctTest, AddOutputBlock) { } TEST_P(PartialIDctTest, SingleExtremeCoeff) { - const int16_t max_coeff = MaxSupportedCoeff(partial_itxfm_); - const int16_t min_coeff = MinSupportedCoeff(partial_itxfm_); + const int16_t max_coeff = std::numeric_limits::max(); + const int16_t min_coeff = std::numeric_limits::min(); for (int i = 0; i < last_nonzero_; ++i) { memset(input_block_, 0, sizeof(*input_block_) * input_block_size_); // Run once for min and once for max. @@ -272,9 +288,9 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) { input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Fails with single coeff of " << coeff << " at " << i @@ -291,26 +307,26 @@ TEST_P(PartialIDctTest, DISABLED_Speed) { for (int i = 0; i < kCountSpeedTestBlock; ++i) { ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); } vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); } libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer) / 1000); - printf("idct%dx%d_%d (bitdepth %d) time: %5d ms ", size_, size_, - last_nonzero_, bit_depth_, elapsed_time); - + printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_, + (pixel_size_ == 1) ? "bitdepth" : "high bitdepth", bit_depth_, + elapsed_time); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; } -using std::tr1::make_tuple; +using std::make_tuple; const PartialInvTxfmParam c_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH @@ -323,6 +339,15 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), make_tuple( &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 34, 8, 2), @@ -350,6 +375,15 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 8, 2), @@ -424,6 +458,8 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, @@ -440,12 +476,89 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, - ::testing::ValuesIn(c_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(C, PartialIDctTest, + ::testing::ValuesIn(c_partial_idct_tests)); -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON const PartialInvTxfmParam neon_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 1, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 64, 8, 2), @@ -488,46 +601,78 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, - ::testing::ValuesIn(neon_partial_idct_tests)); -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(NEON, PartialIDctTest, + ::testing::ValuesIn(neon_partial_idct_tests)); +#endif // HAVE_NEON -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#if HAVE_SSE2 // 32x32_135_ is implemented using the 1024 version. const PartialInvTxfmParam sse2_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 8, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 10, 2), make_tuple( - &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &vpx_highbd_fdct32x32_c, &highbd_wrapper, &highbd_wrapper, TX_32X32, 1, 12, 2), make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, @@ -539,14 +684,32 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 256, 12, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 8, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 10, 2), make_tuple( - &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 64, 8, 2), @@ -557,14 +720,20 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 64, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, - &highbd_wrapper, + &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 8, 2), make_tuple( - &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 10, 2), make_tuple( - &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, &highbd_wrapper, TX_4X4, 16, 8, 2), @@ -574,119 +743,233 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct4x4_c, &highbd_wrapper, &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 135, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, + &wrapper, TX_16X16, 38, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, - ::testing::ValuesIn(sse2_partial_idct_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, PartialIDctTest, + ::testing::ValuesIn(sse2_partial_idct_tests)); -#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 -#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE +#if HAVE_SSSE3 const PartialInvTxfmParam ssse3_partial_idct_tests[] = { - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, - &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1) }; -INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, - ::testing::ValuesIn(ssse3_partial_idct_tests)); -#endif // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_SUITE_P(SSSE3, PartialIDctTest, + ::testing::ValuesIn(ssse3_partial_idct_tests)); +#endif // HAVE_SSSE3 -#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper, + &highbd_wrapper, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper, + &highbd_wrapper, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper, + &highbd_wrapper, TX_16X16, + 256, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH const PartialInvTxfmParam dspr2_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, - ::testing::ValuesIn(dspr2_partial_idct_tests)); -#endif // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(DSPR2, PartialIDctTest, + ::testing::ValuesIn(dspr2_partial_idct_tests)); +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH // 32x32_135_ is implemented using the 1024 version. const PartialInvTxfmParam msa_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, - &wrapper, TX_32X32, 135, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct32x32_c, &wrapper, + make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 256, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 10, 8, 1), - make_tuple(&vpx_fdct16x16_c, &wrapper, + make_tuple(&vpx_fdct16x16_c, &wrapper, &wrapper, TX_16X16, 1, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 64, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 12, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper, + make_tuple(&vpx_fdct8x8_c, &wrapper, &wrapper, TX_8X8, 1, 8, 1), make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 16, 8, 1), - make_tuple(&vpx_fdct4x4_c, &wrapper, + make_tuple(&vpx_fdct4x4_c, &wrapper, &wrapper, TX_4X4, 1, 8, 1) }; -INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, - ::testing::ValuesIn(msa_partial_idct_tests)); -#endif // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(MSA, PartialIDctTest, + ::testing::ValuesIn(msa_partial_idct_tests)); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam lsx_partial_idct_tests[] = { + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 34, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper, + &wrapper, TX_32X32, 1, 8, 1), +}; + +INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest, + ::testing::ValuesIn(lsx_partial_idct_tests)); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/media/libvpx/libvpx/test/postproc.sh b/media/libvpx/libvpx/test/postproc.sh index 939a3e7620..91ca9b26fe 100644 --- a/media/libvpx/libvpx/test/postproc.sh +++ b/media/libvpx/libvpx/test/postproc.sh @@ -38,7 +38,7 @@ postproc() { fi eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } diff --git a/media/libvpx/libvpx/test/pp_filter_test.cc b/media/libvpx/libvpx/test/pp_filter_test.cc index 4b4795accf..bc0566a4f2 100644 --- a/media/libvpx/libvpx/test/pp_filter_test.cc +++ b/media/libvpx/libvpx/test/pp_filter_test.cc @@ -7,30 +7,36 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + #include + +#include + #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "gtest/gtest.h" #include "test/acm_random.h" +#include "test/bench.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" -#include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" using libvpx_test::ACMRandom; +using libvpx_test::Buffer; -typedef void (*VpxPostProcDownAndAcrossMbRowFunc)( +using VpxPostProcDownAndAcrossMbRowFunc = void (*)( unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int cols, unsigned char *flimit, int size); -typedef void (*VpxMbPostProcAcrossIpFunc)(unsigned char *src, int pitch, - int rows, int cols, int flimit); +using VpxMbPostProcAcrossIpFunc = void (*)(unsigned char *src, int pitch, + int rows, int cols, int flimit); -typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, - int cols, int flimit); +using VpxMbPostProcDownFunc = void (*)(unsigned char *dst, int pitch, int rows, + int cols, int flimit); namespace { - // Compute the filter level used in post proc from the loop filter strength int q2mbl(int x) { if (x < 20) x = 20; @@ -40,184 +46,192 @@ int q2mbl(int x) { } class VpxPostProcDownAndAcrossMbRowTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + VpxPostProcDownAndAcrossMbRowTest() + : mb_post_proc_down_and_across_(GetParam()) {} + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override; + + const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; + // Size of the underlying data block that will be filtered. + int block_width_; + int block_height_; + Buffer *src_image_; + Buffer *dst_image_; + uint8_t *flimits_; }; +void VpxPostProcDownAndAcrossMbRowTest::Run() { + mb_post_proc_down_and_across_( + src_image_->TopLeftPixel(), dst_image_->TopLeftPixel(), + src_image_->stride(), dst_image_->stride(), block_width_, flimits_, 16); +} + // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. - const int block_width = 16; - const int block_height = 16; + block_width_ = 16; + block_height_ = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. - const int input_width = block_width; - const int input_height = block_height + 4; - const int input_stride = input_width; - const int input_size = input_width * input_height; + Buffer src_image = Buffer(block_width_, block_height_, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. - const int output_width = block_width + 16; - const int output_height = block_height; - const int output_stride = output_width; - const int output_size = output_width * output_height; - - uint8_t *const src_image = new uint8_t[input_size]; - ASSERT_TRUE(src_image != NULL); - // Though the left padding is only 8 bytes, the assembly code tries to // read 16 bytes before the pointer. - uint8_t *const dst_image = new uint8_t[output_size + 8]; - ASSERT_TRUE(dst_image != NULL); + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); - // Pointers to top-left pixel of block in the input and output images. - uint8_t *const src_image_ptr = src_image + (input_stride << 1); - - // The assembly works in increments of 16. The first read may be offset by - // this amount. - uint8_t *const dst_image_ptr = dst_image + 16; - uint8_t *const flimits = - reinterpret_cast(vpx_memalign(16, block_width)); - (void)memset(flimits, 255, block_width); + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); // Initialize pixels in the input: // block pixels to value 1, // border pixels to value 10. - (void)memset(src_image, 10, input_size); - uint8_t *pixel_ptr = src_image_ptr; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - pixel_ptr[j] = 1; - } - pixel_ptr += input_stride; - } + src_image.SetPadding(10); + src_image.Set(1); // Initialize pixels in the output to 99. - (void)memset(dst_image, 99, output_size); + dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, - input_stride, output_stride, block_width, - flimits, 16)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), + dst_image.stride(), block_width_, flimits_, 16)); - static const uint8_t kExpectedOutput[block_height] = { - 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 - }; + static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 3, 4 }; - pixel_ptr = dst_image_ptr; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j - << ")"; + uint8_t *pixel_ptr = dst_image.TopLeftPixel(); + for (int i = 0; i < block_height_; ++i) { + for (int j = 0; j < block_width_; ++j) { + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; } - pixel_ptr += output_stride; + pixel_ptr += dst_image.stride(); } - delete[] src_image; - delete[] dst_image; - vpx_free(flimits); -}; + vpx_free(flimits_); +} TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V // blocks are always a multiple of 8 wide and exactly 8 high. - const int block_width = 136; - const int block_height = 16; + block_width_ = 136; + block_height_ = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. - const int input_width = block_width; - const int input_height = block_height + 4 + 8; - const int input_stride = input_width; - const int input_size = input_stride * input_height; + Buffer src_image = + Buffer(block_width_, block_height_, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, there is 'above' padding as well + // so when the assembly code tries to read 16 bytes before the pointer it is + // not a problem. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. - const int output_width = block_width + 24; - const int output_height = block_height; - const int output_stride = output_width; - const int output_size = output_stride * output_height; - - uint8_t *const src_image = new uint8_t[input_size]; - ASSERT_TRUE(src_image != NULL); - - // Though the left padding is only 8 bytes, the assembly code tries to - // read 16 bytes before the pointer. - uint8_t *const dst_image = new uint8_t[output_size + 8]; - ASSERT_TRUE(dst_image != NULL); - uint8_t *const dst_image_ref = new uint8_t[output_size + 8]; - ASSERT_TRUE(dst_image_ref != NULL); - - // Pointers to top-left pixel of block in the input and output images. - uint8_t *const src_image_ptr = src_image + (input_stride << 1); - - // The assembly works in increments of 16. The first read may be offset by - // this amount. - uint8_t *const dst_image_ptr = dst_image + 16; - uint8_t *const dst_image_ref_ptr = dst_image + 16; + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); + Buffer dst_image_ref = + Buffer(block_width_, block_height_, 8); + ASSERT_TRUE(dst_image_ref.Init()); // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so // it must be padded out. - const int flimits_width = block_width % 16 ? block_width + 8 : block_width; - uint8_t *const flimits = - reinterpret_cast(vpx_memalign(16, flimits_width)); + const int flimits_width = block_width_ % 16 ? block_width_ + 8 : block_width_; + flimits_ = reinterpret_cast(vpx_memalign(16, flimits_width)); ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); // Initialize pixels in the input: // block pixels to random values. // border pixels to value 10. - (void)memset(src_image, 10, input_size); - uint8_t *pixel_ptr = src_image_ptr; - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - pixel_ptr[j] = rnd.Rand8(); - } - pixel_ptr += input_stride; - } + src_image.SetPadding(10); + src_image.Set(&rnd, &ACMRandom::Rand8); - for (int blocks = 0; blocks < block_width; blocks += 8) { - (void)memset(flimits, 0, sizeof(*flimits) * flimits_width); + for (int blocks = 0; blocks < block_width_; blocks += 8) { + (void)memset(flimits_, 0, sizeof(*flimits_) * flimits_width); for (int f = 0; f < 255; f++) { - (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); - - (void)memset(dst_image, 0, output_size); - (void)memset(dst_image_ref, 0, output_size); + (void)memset(flimits_ + blocks, f, sizeof(*flimits_) * 8); + dst_image.Set(0); + dst_image_ref.Set(0); vpx_post_proc_down_and_across_mb_row_c( - src_image_ptr, dst_image_ref_ptr, input_stride, output_stride, - block_width, flimits, block_height); - ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr, - input_stride, output_stride, - block_width, flimits, 16)); + src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), + src_image.stride(), dst_image_ref.stride(), block_width_, flimits_, + block_height_); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width_, flimits_, + block_height_)); - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { - ASSERT_EQ(dst_image_ref_ptr[j + i * output_stride], - dst_image_ptr[j + i * output_stride]) - << "at (" << i << ", " << j << ")"; - } - } + ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); } } - delete[] src_image; - delete[] dst_image; - delete[] dst_image_ref; - vpx_free(flimits); + vpx_free(flimits_); +} + +TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { + // Size of the underlying data block that will be filtered. + block_width_ = 16; + block_height_ = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer src_image = Buffer(block_width_, block_height_, 2); + ASSERT_TRUE(src_image.Init()); + this->src_image_ = &src_image; + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + this->dst_image_ = &dst_image; + + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. + dst_image.Set(99); + + RunNTimes(INT16_MAX); + PrintMedian("16x16"); + + vpx_free(flimits_); } class VpxMbPostProcAcrossIpTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + VpxMbPostProcAcrossIpTest() + : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), + src_(Buffer(rows_, cols_, 8, 8, 17, 8)) {} + void TearDown() override { libvpx_test::ClearSystemState(); } protected: + void Run() override; + void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { @@ -231,8 +245,8 @@ class VpxMbPostProcAcrossIpTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[c], src_c[c]) << "at (" << r << ", " << c - << ")"; + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } @@ -244,159 +258,113 @@ class VpxMbPostProcAcrossIpTest GetParam()(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } + + const int rows_; + const int cols_; + const VpxMbPostProcAcrossIpFunc mb_post_proc_across_ip_; + Buffer src_; }; +void VpxMbPostProcAcrossIpTest::Run() { + mb_post_proc_across_ip_(src_.TopLeftPixel(), src_.stride(), rows_, cols_, + q2mbl(0)); +} + TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_left_padding = 8; - const int src_right_padding = 17; - const int src_width = cols + src_left_padding + src_right_padding; - const int src_size = rows * src_width; + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - unsigned char *const src = new unsigned char[src_size]; - ASSERT_TRUE(src != NULL); - memset(src, 10, src_size); - unsigned char *const s = src + src_left_padding; - SetCols(s, rows, cols, src_width); + Buffer expected_output = Buffer(cols_, rows_, 0); + ASSERT_TRUE(expected_output.Init()); + SetCols(expected_output.TopLeftPixel(), rows_, cols_, + expected_output.stride()); - unsigned char *expected_output = new unsigned char[rows * cols]; - ASSERT_TRUE(expected_output != NULL); - SetCols(expected_output, rows, cols, cols); - - RunFilterLevel(s, rows, cols, src_width, q2mbl(0), expected_output); - delete[] src; - delete[] expected_output; + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(0), + expected_output.TopLeftPixel()); } TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_left_padding = 8; - const int src_right_padding = 17; - const int src_width = cols + src_left_padding + src_right_padding; - const int src_size = rows * src_width; + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - unsigned char *const src = new unsigned char[src_size]; - ASSERT_TRUE(src != NULL); - memset(src, 10, src_size); - unsigned char *const s = src + src_left_padding; - - SetCols(s, rows, cols, src_width); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 }; - RunFilterLevel(s, rows, cols, src_width, q2mbl(70), kExpectedOutput); - - delete[] src; + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(70), + kExpectedOutput); } TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_left_padding = 8; - const int src_right_padding = 17; - const int src_width = cols + src_left_padding + src_right_padding; - const int src_size = rows * src_width; + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - unsigned char *const src = new unsigned char[src_size]; - ASSERT_TRUE(src != NULL); - unsigned char *const s = src + src_left_padding; - - memset(src, 10, src_size); - SetCols(s, rows, cols, src_width); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 }; - RunFilterLevel(s, rows, cols, src_width, INT_MAX, kExpectedOutput); + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), INT_MAX, + kExpectedOutput); - memset(src, 10, src_size); - SetCols(s, rows, cols, src_width); - RunFilterLevel(s, rows, cols, src_width, q2mbl(100), kExpectedOutput); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - delete[] src; + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(100), + kExpectedOutput); } TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - const int src_left_padding = 8; - const int src_right_padding = 17; - const int src_width = cols + src_left_padding + src_right_padding; - const int src_size = rows * src_width; - - unsigned char *const c_mem = new unsigned char[src_size]; - unsigned char *const asm_mem = new unsigned char[src_size]; - ASSERT_TRUE(c_mem != NULL); - ASSERT_TRUE(asm_mem != NULL); - unsigned char *const src_c = c_mem + src_left_padding; - unsigned char *const src_asm = asm_mem + src_left_padding; + Buffer c_mem = Buffer(cols_, rows_, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); + Buffer asm_mem = Buffer(cols_, rows_, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); // When level >= 100, the filter behaves the same as the level = INT_MAX // When level < 20, it behaves the same as the level = 0 for (int level = 0; level < 100; level++) { - memset(c_mem, 10, src_size); - memset(asm_mem, 10, src_size); - SetCols(src_c, rows, cols, src_width); - SetCols(src_asm, rows, cols, src_width); + c_mem.SetPadding(10); + asm_mem.SetPadding(10); + SetCols(c_mem.TopLeftPixel(), rows_, cols_, c_mem.stride()); + SetCols(asm_mem.TopLeftPixel(), rows_, cols_, asm_mem.stride()); - vpx_mbpost_proc_across_ip_c(src_c, src_width, rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_asm, src_width, rows, cols, q2mbl(level))); + vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows_, + cols_, q2mbl(level)); + ASM_REGISTER_STATE_CHECK(GetParam()( + asm_mem.TopLeftPixel(), asm_mem.stride(), rows_, cols_, q2mbl(level))); - RunComparison(src_c, src_asm, rows, cols, src_width); + ASSERT_TRUE(asm_mem.CheckValues(c_mem)); } +} - delete[] c_mem; - delete[] asm_mem; +TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); } class VpxMbPostProcDownTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + VpxMbPostProcDownTest() + : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), + src_c_(Buffer(rows_, cols_, 8, 8, 8, 17)) {} + + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void SetRows(unsigned char *src_c, int rows, int cols) { + void Run() override; + + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { memset(src_c, r, cols); - src_c += cols; - } - } - - void SetRandom(unsigned char *src_c, unsigned char *src_asm, int rows, - int cols, int src_pitch) { - ACMRandom rnd; - rnd.Reset(ACMRandom::DeterministicSeed()); - - // Add some random noise to the input - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - const int noise = rnd(4); - src_c[c] = r + noise; - src_asm[c] = r + noise; - } - src_c += src_pitch; - src_asm += src_pitch; - } - } - - void SetRandomSaturation(unsigned char *src_c, unsigned char *src_asm, - int rows, int cols, int src_pitch) { - ACMRandom rnd; - rnd.Reset(ACMRandom::DeterministicSeed()); - - // Add some random noise to the input - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - const int noise = 3 * rnd(2); - src_c[c] = r + noise; - src_asm[c] = r + noise; - } - src_c += src_pitch; - src_asm += src_pitch; + src_c += src_width; } } @@ -404,48 +372,38 @@ class VpxMbPostProcDownTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[r * rows + c], src_c[c]) << "at (" << r - << ", " << c << ")"; + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } } - void RunComparison(unsigned char *src_c, unsigned char *src_asm, int rows, - int cols, int src_pitch) { - for (int r = 0; r < rows; r++) { - for (int c = 0; c < cols; c++) { - ASSERT_EQ(src_c[c], src_asm[c]) << "at (" << r << ", " << c << ")"; - } - src_c += src_pitch; - src_asm += src_pitch; - } - } - void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, int filter_level, const unsigned char *expected_output) { ASM_REGISTER_STATE_CHECK( - GetParam()(s, src_width, rows, cols, filter_level)); + mb_post_proc_down_(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } + + const int rows_; + const int cols_; + const VpxMbPostProcDownFunc mb_post_proc_down_; + Buffer src_c_; }; +void VpxMbPostProcDownTest::Run() { + mb_post_proc_down_(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(0)); +} + TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_pitch = cols; - const int src_top_padding = 8; - const int src_bottom_padding = 17; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - const int src_size = cols * (rows + src_top_padding + src_bottom_padding); - unsigned char *const c_mem = new unsigned char[src_size]; - ASSERT_TRUE(c_mem != NULL); - memset(c_mem, 10, src_size); - unsigned char *const src_c = c_mem + src_top_padding * src_pitch; + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c, rows, cols); - - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, @@ -462,31 +420,22 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { 13, 13, 13, 13, 14, 13, 13, 13, 13 }; - RunFilterLevel(src_c, rows, cols, src_pitch, INT_MAX, kExpectedOutput); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), INT_MAX, + kExpectedOutput); - memset(c_mem, 10, src_size); - SetRows(src_c, rows, cols); - RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(100), kExpectedOutput); - - delete[] c_mem; + src_c_.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(100), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_pitch = cols; - const int src_top_padding = 8; - const int src_bottom_padding = 17; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - const int src_size = cols * (rows + src_top_padding + src_bottom_padding); - unsigned char *const c_mem = new unsigned char[src_size]; - ASSERT_TRUE(c_mem != NULL); - memset(c_mem, 10, src_size); - unsigned char *const src_c = c_mem + src_top_padding * src_pitch; + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c, rows, cols); - - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -503,113 +452,124 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { 13, 13, 13, 13, 14, 13, 13, 13, 13 }; - RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(70), kExpectedOutput); - - delete[] c_mem; + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(70), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; - const int src_pitch = cols; - const int src_top_padding = 8; - const int src_bottom_padding = 17; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - const int src_size = cols * (rows + src_top_padding + src_bottom_padding); - unsigned char *const c_mem = new unsigned char[src_size]; - ASSERT_TRUE(c_mem != NULL); - memset(c_mem, 10, src_size); - unsigned char *const src_c = c_mem + src_top_padding * src_pitch; + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c, rows, cols); + std::unique_ptr expected_output( + new unsigned char[rows_ * cols_]); + ASSERT_NE(expected_output, nullptr); + SetRows(expected_output.get(), rows_, cols_, cols_); - unsigned char *expected_output = new unsigned char[rows * cols]; - ASSERT_TRUE(expected_output != NULL); - SetRows(expected_output, rows, cols); - - RunFilterLevel(src_c, rows, cols, src_pitch, q2mbl(0), expected_output); - - delete[] c_mem; - delete[] expected_output; + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), + expected_output.get()); } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - const int src_pitch = cols; - const int src_top_padding = 8; - const int src_bottom_padding = 17; - const int src_size = cols * (rows + src_top_padding + src_bottom_padding); - unsigned char *const c_mem = new unsigned char[src_size]; - unsigned char *const asm_mem = new unsigned char[src_size]; - ASSERT_TRUE(c_mem != NULL); - ASSERT_TRUE(asm_mem != NULL); - unsigned char *const src_c = c_mem + src_top_padding * src_pitch; - unsigned char *const src_asm = asm_mem + src_top_padding * src_pitch; + ACMRandom rnd; + rnd.Reset(ACMRandom::DeterministicSeed()); + + ASSERT_TRUE(src_c_.Init()); + Buffer src_asm = Buffer(cols_, rows_, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); for (int level = 0; level < 100; level++) { - memset(c_mem, 10, src_size); - memset(asm_mem, 10, src_size); - SetRandom(src_c, src_asm, rows, cols, src_pitch); - vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_asm, src_pitch, rows, cols, q2mbl(level))); - RunComparison(src_c, src_asm, rows, cols, src_pitch); + src_c_.SetPadding(10); + src_asm.SetPadding(10); + src_c_.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c_); - memset(c_mem, 10, src_size); - memset(asm_mem, 10, src_size); - SetRandomSaturation(src_c, src_asm, rows, cols, src_pitch); - vpx_mbpost_proc_down_c(src_c, src_pitch, rows, cols, q2mbl(level)); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_asm, src_pitch, rows, cols, q2mbl(level))); - RunComparison(src_c, src_asm, rows, cols, src_pitch); + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); + + src_c_.SetPadding(10); + src_asm.SetPadding(10); + src_c_.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c_); + + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(level)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); } - - delete[] c_mem; - delete[] asm_mem; } -INSTANTIATE_TEST_CASE_P( +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + +INSTANTIATE_TEST_SUITE_P( C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); -INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_c)); +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_c)); -INSTANTIATE_TEST_CASE_P(C, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_c)); +INSTANTIATE_TEST_SUITE_P(C, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_sse2)); #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon)); -INSTANTIATE_TEST_CASE_P(NEON, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_neon)); #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcAcrossIpTest, - ::testing::Values(vpx_mbpost_proc_across_ip_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, - ::testing::Values(vpx_mbpost_proc_down_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + } // namespace diff --git a/media/libvpx/libvpx/test/predict_test.cc b/media/libvpx/libvpx/test/predict_test.cc index a6e2b3cf32..cc290bf50c 100644 --- a/media/libvpx/libvpx/test/predict_test.cc +++ b/media/libvpx/libvpx/test/predict_test.cc @@ -8,14 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp8_rtcd.h" #include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -25,23 +28,24 @@ namespace { using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; -typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, - int xoffset, int yoffset, uint8_t *dst_ptr, - int dst_pitch); +using PredictFunc = void (*)(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch); -typedef std::tr1::tuple PredictParam; +using PredictParam = std::tuple; -class PredictTestBase : public ::testing::TestWithParam { +class PredictTestBase : public AbstractBench, + public ::testing::TestWithParam { public: PredictTestBase() : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), - src_(NULL), padded_dst_(NULL), dst_(NULL), dst_c_(NULL) {} + src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} - virtual void SetUp() { + void SetUp() override { src_ = new uint8_t[kSrcSize]; - ASSERT_TRUE(src_ != NULL); + ASSERT_NE(src_, nullptr); // padded_dst_ provides a buffer of kBorderSize around the destination // memory to facilitate detecting out of bounds writes. @@ -49,25 +53,25 @@ class PredictTestBase : public ::testing::TestWithParam { padded_dst_size_ = dst_stride_ * (kBorderSize + height_ + kBorderSize); padded_dst_ = reinterpret_cast(vpx_memalign(16, padded_dst_size_)); - ASSERT_TRUE(padded_dst_ != NULL); + ASSERT_NE(padded_dst_, nullptr); dst_ = padded_dst_ + (kBorderSize * dst_stride_) + kBorderSize; dst_c_ = new uint8_t[16 * 16]; - ASSERT_TRUE(dst_c_ != NULL); + ASSERT_NE(dst_c_, nullptr); memset(src_, 0, kSrcSize); memset(padded_dst_, 128, padded_dst_size_); memset(dst_c_, 0, 16 * 16); } - virtual void TearDown() { + void TearDown() override { delete[] src_; - src_ = NULL; + src_ = nullptr; vpx_free(padded_dst_); - padded_dst_ = NULL; - dst_ = NULL; + padded_dst_ = nullptr; + dst_ = nullptr; delete[] dst_c_; - dst_c_ = NULL; + dst_c_ = nullptr; libvpx_test::ClearSystemState(); } @@ -204,7 +208,20 @@ class PredictTestBase : public ::testing::TestWithParam { } } } -}; + + void Run() override { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + + predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_, + dst_stride_); + } + } + } +}; // namespace class SixtapPredictTest : public PredictTestBase {}; @@ -281,14 +298,14 @@ TEST_P(SixtapPredictTest, TestWithPresetData) { CompareBuffers(kExpectedDst, kExpectedDstStride, dst_, dst_stride_)); } -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_c), make_tuple(8, 8, &vp8_sixtap_predict8x8_c), make_tuple(8, 4, &vp8_sixtap_predict8x4_c), make_tuple(4, 4, &vp8_sixtap_predict4x4_c))); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_neon), make_tuple(8, 8, &vp8_sixtap_predict8x8_neon), @@ -296,19 +313,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_neon))); #endif #if HAVE_MMX -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MMX, SixtapPredictTest, ::testing::Values(make_tuple(4, 4, &vp8_sixtap_predict4x4_mmx))); #endif #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_sse2), make_tuple(8, 8, &vp8_sixtap_predict8x8_sse2), make_tuple(8, 4, &vp8_sixtap_predict8x4_sse2))); #endif #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_ssse3), make_tuple(8, 8, &vp8_sixtap_predict8x8_ssse3), @@ -316,7 +333,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_ssse3))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, SixtapPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_msa), make_tuple(8, 8, &vp8_sixtap_predict8x8_msa), @@ -324,6 +341,23 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { @@ -332,41 +366,45 @@ TEST_P(BilinearPredictTest, TestWithRandomData) { TEST_P(BilinearPredictTest, TestWithUnalignedDst) { TestWithUnalignedDst(vp8_bilinear_predict16x16_c); } +TEST_P(BilinearPredictTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 5000000 / (width_ * height_); + RunNTimes(kCountSpeedTestBlock); -INSTANTIATE_TEST_CASE_P( + char title[16]; + snprintf(title, sizeof(title), "%dx%d", width_, height_); + PrintMedian(title); +} + +INSTANTIATE_TEST_SUITE_P( C, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c), make_tuple(8, 8, &vp8_bilinear_predict8x8_c), make_tuple(8, 4, &vp8_bilinear_predict8x4_c), make_tuple(4, 4, &vp8_bilinear_predict4x4_c))); #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_neon), make_tuple(8, 8, &vp8_bilinear_predict8x8_neon), make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); #endif -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, BilinearPredictTest, - ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), - make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); -#endif #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), - make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2))); + make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), + make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2), + make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); #endif #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_ssse3), make_tuple(8, 8, &vp8_bilinear_predict8x8_ssse3))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_msa), make_tuple(8, 8, &vp8_bilinear_predict8x8_msa), diff --git a/media/libvpx/libvpx/test/quantize_test.cc b/media/libvpx/libvpx/test/quantize_test.cc index 69da8994ca..8185dd6356 100644 --- a/media/libvpx/libvpx/test/quantize_test.cc +++ b/media/libvpx/libvpx/test/quantize_test.cc @@ -9,12 +9,14 @@ */ #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" -#include "./vpx_config.h" #include "./vp8_rtcd.h" +#include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -31,12 +33,12 @@ namespace { const int kNumBlocks = 25; const int kNumBlockEntries = 16; -typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d); +using VP8Quantize = void (*)(BLOCK *b, BLOCKD *d); -typedef std::tr1::tuple VP8QuantizeParam; +using VP8QuantizeParam = std::tuple; using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; // Create and populate a VP8_COMP instance which has a complete set of // quantization inputs as well as a second MACROBLOCKD for output. @@ -44,9 +46,9 @@ class QuantizeTestBase { public: virtual ~QuantizeTestBase() { vp8_remove_compressor(&vp8_comp_); - vp8_comp_ = NULL; + vp8_comp_ = nullptr; vpx_free(macroblockd_dst_); - macroblockd_dst_ = NULL; + macroblockd_dst_ = nullptr; libvpx_test::ClearSystemState(); } @@ -69,7 +71,7 @@ class QuantizeTestBase { // Copy macroblockd from the reference to get pre-set-up dequant values. macroblockd_dst_ = reinterpret_cast( vpx_memalign(32, sizeof(*macroblockd_dst_))); - memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_)); + *macroblockd_dst_ = vp8_comp_->mb.e_mbd; // Fix block pointers - currently they point to the blocks in the reference // structure. vp8_setup_block_dptrs(macroblockd_dst_); @@ -78,7 +80,7 @@ class QuantizeTestBase { void UpdateQuantizer(int q) { vp8_set_quantizer(vp8_comp_, q); - memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_)); + *macroblockd_dst_ = vp8_comp_->mb.e_mbd; vp8_setup_block_dptrs(macroblockd_dst_); } @@ -116,14 +118,19 @@ class QuantizeTestBase { }; class QuantizeTest : public QuantizeTestBase, - public ::testing::TestWithParam { + public ::testing::TestWithParam, + public AbstractBench { protected: - virtual void SetUp() { + void SetUp() override { SetupCompressor(); asm_quant_ = GET_PARAM(0); c_quant_ = GET_PARAM(1); } + void Run() override { + asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); + } + void RunComparison() { for (int i = 0; i < kNumBlocks; ++i) { ASM_REGISTER_STATE_CHECK( @@ -139,6 +146,7 @@ class QuantizeTest : public QuantizeTestBase, VP8Quantize asm_quant_; VP8Quantize c_quant_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(QuantizeTest); TEST_P(QuantizeTest, TestZeroInput) { FillCoeffConstant(0); @@ -166,8 +174,15 @@ TEST_P(QuantizeTest, TestMultipleQ) { } } +TEST_P(QuantizeTest, DISABLED_Speed) { + FillCoeffRandom(); + + RunNTimes(10000000); + PrintMedian("vp8 quantize"); +} + #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, QuantizeTest, ::testing::Values( make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c), @@ -175,29 +190,45 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest, - ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3, - &vp8_fast_quantize_b_c))); +INSTANTIATE_TEST_SUITE_P( + SSSE3, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3, + &vp8_fast_quantize_b_c))); #endif // HAVE_SSSE3 #if HAVE_SSE4_1 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE4_1, QuantizeTest, ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1, &vp8_regular_quantize_b_c))); #endif // HAVE_SSE4_1 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest, - ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, - &vp8_fast_quantize_b_c))); +INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest, + ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon, + &vp8_fast_quantize_b_c))); #endif // HAVE_NEON #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, QuantizeTest, ::testing::Values( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, QuantizeTest, + ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx, + &vp8_regular_quantize_b_c))); +#endif // HAVE_LSX } // namespace diff --git a/media/libvpx/libvpx/test/realtime_test.cc b/media/libvpx/libvpx/test/realtime_test.cc index 63f1ac3c29..b0cc929dda 100644 --- a/media/libvpx/libvpx/test/realtime_test.cc +++ b/media/libvpx/libvpx/test/realtime_test.cc @@ -7,11 +7,14 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include + +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/util.h" #include "test/video_source.h" -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_config.h" namespace { @@ -24,40 +27,96 @@ class RealtimeTest public ::libvpx_test::CodecTestWithParam { protected: RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} - virtual ~RealtimeTest() {} + ~RealtimeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); cfg_.g_lag_in_frames = 0; SetMode(::libvpx_test::kRealTime); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { +#if !CONFIG_REALTIME_ONLY // TODO(tomfinegan): We're changing the pass value here to make sure // we get frames when real time mode is combined with |g_pass| set to // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets // the pass value based on the mode passed into EncoderTest::SetMode(), // which overrides the one specified in SetUp() above. cfg_.g_pass = VPX_RC_FIRST_PASS; +#endif } - virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) { + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0 && set_cpu_used_) { + encoder->Control(VP8E_SET_CPUUSED, 8); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override { frame_packets_++; } + bool IsVP9() const { +#if CONFIG_VP9_ENCODER + return codec_ == &libvpx_test::kVP9; +#else + return false; +#endif + } + + void TestIntegerOverflow(unsigned int width, unsigned int height) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(width, height); + video.set_limit(20); + cfg_.rc_target_bitrate = UINT_MAX; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void TestEncode() { + ::libvpx_test::RandomVideoSource video; + video.SetSize(kVideoSourceWidth, kVideoSourceHeight); + video.set_limit(kFramesToEncode); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_EQ(kFramesToEncode, frame_packets_); + } + int frame_packets_; + bool set_cpu_used_ = true; }; -TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { - ::libvpx_test::RandomVideoSource video; - video.SetSize(kVideoSourceWidth, kVideoSourceHeight); - video.set_limit(kFramesToEncode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - EXPECT_EQ(kFramesToEncode, frame_packets_); +TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); } + +TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) { + set_cpu_used_ = false; + TestEncode(); } -VP8_INSTANTIATE_TEST_CASE(RealtimeTest, - ::testing::Values(::libvpx_test::kRealTime)); -VP9_INSTANTIATE_TEST_CASE(RealtimeTest, - ::testing::Values(::libvpx_test::kRealTime)); +TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } + +TEST_P(RealtimeTest, IntegerOverflowLarge) { +#ifdef CHROMIUM + GTEST_SKIP() << "16K framebuffers are not supported by Chromium's allocator."; +#else + if (IsVP9()) { +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 + TestIntegerOverflow(16384, 16384); +#else + TestIntegerOverflow(4096, 4096); +#endif + } else { + GTEST_SKIP() + << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):" + << " Enable this test after bitstream errors & undefined sanitizer " + "warnings are fixed."; + // TestIntegerOverflow(16383, 16383); + } +#endif // defined(CHROMIUM) +} + +VP8_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(RealtimeTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/media/libvpx/libvpx/test/register_state_check.h b/media/libvpx/libvpx/test/register_state_check.h index 84641c8e99..96795f65be 100644 --- a/media/libvpx/libvpx/test/register_state_check.h +++ b/media/libvpx/libvpx/test/register_state_check.h @@ -8,10 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_REGISTER_STATE_CHECK_H_ -#define TEST_REGISTER_STATE_CHECK_H_ +#ifndef VPX_TEST_REGISTER_STATE_CHECK_H_ +#define VPX_TEST_REGISTER_STATE_CHECK_H_ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -28,13 +28,14 @@ // See platform implementations of RegisterStateCheckXXX for details. // -#if defined(_WIN64) +#if defined(_WIN64) && VPX_ARCH_X86_64 #undef NOMINMAX #define NOMINMAX #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif +#include #include #include @@ -55,7 +56,7 @@ class RegisterStateCheck { private: static bool StoreRegisters(CONTEXT *const context) { const HANDLE this_thread = GetCurrentThread(); - EXPECT_TRUE(this_thread != NULL); + EXPECT_NE(this_thread, nullptr); context->ContextFlags = CONTEXT_FLOATING_POINT; const bool context_saved = GetThreadContext(this_thread, context) == TRUE; EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); @@ -81,10 +82,13 @@ class RegisterStateCheck { CONTEXT pre_context_; }; -#define ASM_REGISTER_STATE_CHECK(statement) \ - do { \ - libvpx_test::RegisterStateCheck reg_check; \ - statement; \ +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + _ReadWriteBarrier(); \ } while (false) } // namespace libvpx_test @@ -113,19 +117,30 @@ class RegisterStateCheck { int64_t post_store[8]; vpx_push_neon(post_store); for (int i = 0; i < 8; ++i) { - EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8 - << " has been modified"; + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; } } int64_t pre_store_[8]; }; +#if defined(__GNUC__) +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheck reg_check; \ + statement; \ + } \ + __asm__ volatile("" ::: "memory"); \ + } while (false) +#else #define ASM_REGISTER_STATE_CHECK(statement) \ do { \ libvpx_test::RegisterStateCheck reg_check; \ statement; \ } while (false) +#endif } // namespace libvpx_test @@ -138,9 +153,9 @@ class RegisterStateCheck {}; } // namespace libvpx_test -#endif // _WIN64 +#endif // _WIN64 && VPX_ARCH_X86_64 -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 #if defined(__GNUC__) namespace libvpx_test { @@ -169,19 +184,22 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test #endif // __GNUC__ -#endif // ARCH_X86 || ARCH_X86_64 +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 #ifndef API_REGISTER_STATE_CHECK #define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK #endif -#endif // TEST_REGISTER_STATE_CHECK_H_ +#endif // VPX_TEST_REGISTER_STATE_CHECK_H_ diff --git a/media/libvpx/libvpx/test/resize_test.cc b/media/libvpx/libvpx/test/resize_test.cc index c9950dd433..1a8319481d 100644 --- a/media/libvpx/libvpx/test/resize_test.cc +++ b/media/libvpx/libvpx/test/resize_test.cc @@ -7,16 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include - #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -#include "test/video_source.h" #include "test/util.h" +#include "test/video_source.h" +#include "vpx_config.h" // Enable(1) or Disable(0) writing of the compressed bitstream. #define WRITE_COMPRESSED_STREAM 0 @@ -93,10 +92,29 @@ struct FrameInfo { void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, unsigned int initial_h, unsigned int *w, - unsigned int *h, int flag_codec) { + unsigned int *h, bool flag_codec, + bool smaller_width_larger_size, + bool random_input_one_half_only) { + *w = initial_w; + *h = initial_h; + + if (random_input_one_half_only == 1) { + if (frame < 100) { + return; + } + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (smaller_width_larger_size) { + if (frame < 30) { + return; + } + *w = initial_w * 7 / 10; + *h = initial_h * 16 / 10; + return; + } if (frame < 10) { - *w = initial_w; - *h = initial_h; return; } if (frame < 20) { @@ -110,8 +128,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 40) { - *w = initial_w; - *h = initial_h; return; } if (frame < 50) { @@ -125,8 +141,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 70) { - *w = initial_w; - *h = initial_h; return; } if (frame < 80) { @@ -145,8 +159,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 110) { - *w = initial_w; - *h = initial_h; return; } if (frame < 120) { @@ -165,8 +177,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 150) { - *w = initial_w; - *h = initial_h; return; } if (frame < 160) { @@ -185,8 +195,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 190) { - *w = initial_w; - *h = initial_h; return; } if (frame < 200) { @@ -205,8 +213,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 230) { - *w = initial_w; - *h = initial_h; return; } if (frame < 240) { @@ -220,8 +226,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, return; } if (frame < 260) { - *w = initial_w; - *h = initial_h; return; } // Go down very low. @@ -234,34 +238,52 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, // Cases that only works for VP9. // For VP9: Swap width and height of original. if (frame < 320) { - *w = initial_h; - *h = initial_w; return; } } - *w = initial_w; - *h = initial_h; } class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { public: - ResizingVideoSource() { - SetSize(kInitialWidth, kInitialHeight); + ResizingVideoSource(int width, int height) + : smaller_width_larger_size_(false), random_input_one_half_only_(false), + configured_width_(width), configured_height_(height) { limit_ = 350; + SetSize(configured_width_, configured_height_); } - int flag_codec_; - virtual ~ResizingVideoSource() {} + bool flag_codec_; + bool smaller_width_larger_size_; + bool random_input_one_half_only_; + // configured_width_/height_ is the configured resolution when codec is + // created. + int configured_width_; + int configured_height_; + ~ResizingVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; - unsigned int width; - unsigned int height; - ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, - flag_codec_); + unsigned int width = 0; + unsigned int height = 0; + ScaleForFrameNumber(frame_, configured_width_, configured_height_, &width, + &height, flag_codec_, smaller_width_larger_size_, + random_input_one_half_only_); SetSize(width, height); FillFrame(); } + + void FillFrame() override { + if (img_) { + memset(img_->img_data, 0, raw_sz_); + if (random_input_one_half_only_) { + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + unsigned char *image = img_->planes[0]; + for (size_t i = 0; i < raw_sz_; ++i) { + image[i] = rnd.Rand8(); + } + } + } + } }; class ResizeTest @@ -270,38 +292,60 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeTest() {} + ~ResizeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } std::vector frame_info_list_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeTest, TestExternalResizeWorks) { - ResizingVideoSource video; - video.flag_codec_ = 0; + ResizingVideoSource video(kInitialWidth, kInitialHeight); + video.flag_codec_ = false; + video.smaller_width_larger_size_ = false; cfg_.g_lag_in_frames = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (std::vector::const_iterator info = frame_info_list_.begin(); - info != frame_info_list_.end(); ++info) { - const unsigned int frame = static_cast(info->pts); + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast(info.pts); unsigned int expected_w; unsigned int expected_h; + const size_t idx = &info - &frame_info_list_[0]; + ASSERT_EQ(info.w, GetFrameWidth(idx)); + ASSERT_EQ(info.h, GetFrameHeight(idx)); ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, - &expected_h, 0); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_, + /*random_input_one_half_only=*/false); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; } } @@ -312,32 +356,32 @@ class ResizeInternalTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM ResizeInternalTest() - : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} #else ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeInternalTest() {} + ~ResizeInternalTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp90-2-05-resize.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) write_ivf_file_header(&cfg_, out_frames_, outfile_); fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (change_config_) { int new_q = 60; if (video->frame() == 0) { @@ -362,13 +406,13 @@ class ResizeInternalTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -404,15 +448,14 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) { cfg_.g_lag_in_frames = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (std::vector::const_iterator info = frame_info_list_.begin(); - info != frame_info_list_.end(); ++info) { - const vpx_codec_pts_t pts = info->pts; + for (const auto &info : frame_info_list_) { + const vpx_codec_pts_t pts = info.pts; if (pts >= kStepDownFrame && pts < kStepUpFrame) { - ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width"; - ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height"; + ASSERT_EQ(282U, info.w) << "Frame " << pts << " had unexpected width"; + ASSERT_EQ(173U, info.h) << "Frame " << pts << " had unexpected height"; } else { - EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width"; - EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height"; + EXPECT_EQ(352U, info.w) << "Frame " << pts << " had unexpected width"; + EXPECT_EQ(288U, info.h) << "Frame " << pts << " had unexpected height"; } } } @@ -431,13 +474,17 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeRealtimeTest() {} + ~ResizeRealtimeTest() override = default; - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + if (cfg_.g_threads > 0) { + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, cfg_.g_threads >> 1); + } } if (change_bitrate_ && video->frame() == 120) { @@ -447,25 +494,40 @@ class ResizeRealtimeTest } } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; } + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + void DefaultConfig() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -493,11 +555,14 @@ class ResizeRealtimeTest bool change_bitrate_; double mismatch_psnr_; int mismatch_nframes_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { - ResizingVideoSource video; - video.flag_codec_ = 1; + ResizingVideoSource video(kInitialWidth, kInitialHeight); + video.flag_codec_ = true; + video.smaller_width_larger_size_ = false; DefaultConfig(); // Disable internal resize for this test. cfg_.rc_resize_allowed = 0; @@ -506,18 +571,80 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { mismatch_nframes_ = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (std::vector::const_iterator info = frame_info_list_.begin(); - info != frame_info_list_.end(); ++info) { - const unsigned int frame = static_cast(info->pts); + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast(info.pts); unsigned int expected_w; unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, - &expected_h, 1); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_, + /*random_input_one_half_only=*/false); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(GetMismatchFrames(), static_cast(0)); + } +} + +// This test uses 4 threads with small keyframe spacing, random input, +// and uses 640x480 as initial resolution. +TEST_P(ResizeRealtimeTest, TestExternalResizeWorks4Threads) { + ResizingVideoSource video(640, 480); + video.flag_codec_ = true; + video.smaller_width_larger_size_ = false; + video.random_input_one_half_only_ = true; + DefaultConfig(); + // Disable internal resize for this test. + cfg_.rc_resize_allowed = 0; + cfg_.g_threads = 4; + cfg_.kf_max_dist = 50; + cfg_.kf_min_dist = 50; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast(info.pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, 640, 480, &expected_w, &expected_h, + video.flag_codec_, video.smaller_width_larger_size_, + video.random_input_one_half_only_); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(GetMismatchFrames(), static_cast(0)); + } +} + +TEST_P(ResizeRealtimeTest, TestExternalResizeSmallerWidthBiggerSize) { + ResizingVideoSource video(kInitialWidth, kInitialHeight); + video.flag_codec_ = true; + video.smaller_width_larger_size_ = true; + DefaultConfig(); + // Disable internal resize for this test. + cfg_.rc_resize_allowed = 0; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast(info.pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, video.flag_codec_, + video.smaller_width_larger_size_, + /*random_input_one_half_only=*/false); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(GetMismatchFrames(), static_cast(0)); } } @@ -525,37 +652,37 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { // Run at low bitrate, with resize_allowed = 1, and verify that we get // one resize down event. TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 299); DefaultConfig(); - cfg_.g_w = 352; - cfg_.g_h = 288; + cfg_.g_w = 640; + cfg_.g_h = 480; change_bitrate_ = false; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER unsigned int last_w = cfg_.g_w; unsigned int last_h = cfg_.g_h; int resize_count = 0; - for (std::vector::const_iterator info = frame_info_list_.begin(); - info != frame_info_list_.end(); ++info) { - if (info->w != last_w || info->h != last_h) { + for (const auto &info : frame_info_list_) { + if (info.w != last_w || info.h != last_h) { // Verify that resize down occurs. - ASSERT_LT(info->w, last_w); - ASSERT_LT(info->h, last_h); - last_w = info->w; - last_h = info->h; + ASSERT_LT(info.w, last_w); + ASSERT_LT(info.h, last_h); + last_w = info.w; + last_h = info.h; resize_count++; } } -#if CONFIG_VP9_DECODER // Verify that we get 1 resize down event in this test. ASSERT_EQ(1, resize_count) << "Resizing should occur."; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + EXPECT_EQ(GetMismatchFrames(), static_cast(0)); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } @@ -563,11 +690,11 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDown) { // Start at low target bitrate, raise the bitrate in the middle of the clip, // scaling-up should occur after bitrate changed. TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 359); + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); DefaultConfig(); - cfg_.g_w = 352; - cfg_.g_h = 288; + cfg_.g_w = 640; + cfg_.g_h = 480; change_bitrate_ = true; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; @@ -580,30 +707,33 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { unsigned int last_w = cfg_.g_w; unsigned int last_h = cfg_.g_h; int resize_count = 0; - for (std::vector::const_iterator info = frame_info_list_.begin(); - info != frame_info_list_.end(); ++info) { - if (info->w != last_w || info->h != last_h) { + for (const auto &info : frame_info_list_) { + const size_t idx = &info - &frame_info_list_[0]; + ASSERT_EQ(info.w, GetFrameWidth(idx)); + ASSERT_EQ(info.h, GetFrameHeight(idx)); + if (info.w != last_w || info.h != last_h) { resize_count++; - if (resize_count == 1) { + if (resize_count <= 2) { // Verify that resize down occurs. - ASSERT_LT(info->w, last_w); - ASSERT_LT(info->h, last_h); - } else if (resize_count == 2) { + ASSERT_LT(info.w, last_w); + ASSERT_LT(info.h, last_h); + } else if (resize_count > 2) { // Verify that resize up occurs. - ASSERT_GT(info->w, last_w); - ASSERT_GT(info->h, last_h); + ASSERT_GT(info.w, last_w); + ASSERT_GT(info.h, last_h); } - last_w = info->w; - last_h = info->h; + last_w = info.w; + last_h = info.h; } } #if CONFIG_VP9_DECODER - // Verify that we get 2 resize events in this test. - ASSERT_EQ(resize_count, 2) << "Resizing should occur twice."; - EXPECT_EQ(static_cast(0), GetMismatchFrames()); + // Verify that we get 4 resize events in this test. + ASSERT_EQ(resize_count, 4) << "Resizing should occur twice."; + EXPECT_EQ(GetMismatchFrames(), static_cast(0)); #else - printf("Warning: VP9 decoder unavailable, unable to check resize count!\n"); + GTEST_SKIP() + << "Warning: VP9 decoder unavailable, unable to check resize count!\n"; #endif } @@ -617,32 +747,32 @@ class ResizeCspTest : public ResizeTest { protected: #if WRITE_COMPRESSED_STREAM ResizeCspTest() - : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} + : ResizeTest(), frame0_psnr_(0.0), outfile_(nullptr), out_frames_(0) {} #else ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeCspTest() {} + ~ResizeCspTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) write_ivf_file_header(&cfg_, out_frames_, outfile_); fclose(outfile_); - outfile_ = NULL; + outfile_ = nullptr; } #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 && cfg_.g_profile != 1) { cfg_.g_profile = 1; @@ -655,13 +785,13 @@ class ResizeCspTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -687,10 +817,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - virtual ~ResizingCspVideoSource() {} + ~ResizingCspVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; SetImageFormat(CspForFrameNumber(frame_)); FillFrame(); @@ -705,14 +835,13 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } -VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES); -VP9_INSTANTIATE_TEST_CASE(ResizeTest, - ::testing::Values(::libvpx_test::kRealTime)); -VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest, - ::testing::Values(::libvpx_test::kOnePassBest)); -VP9_INSTANTIATE_TEST_CASE(ResizeRealtimeTest, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -VP9_INSTANTIATE_TEST_CASE(ResizeCspTest, - ::testing::Values(::libvpx_test::kRealTime)); +VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES); +VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest, + ::testing::Values(::libvpx_test::kOnePassBest)); +VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); +VP9_INSTANTIATE_TEST_SUITE(ResizeCspTest, + ::testing::Values(::libvpx_test::kRealTime)); } // namespace diff --git a/media/libvpx/libvpx/test/resize_util.sh b/media/libvpx/libvpx/test/resize_util.sh deleted file mode 100644 index 5e472716da..0000000000 --- a/media/libvpx/libvpx/test/resize_util.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/sh -## -## Copyright (c) 2014 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## -## This file tests the libvpx resize_util example code. To add new tests to -## this file, do the following: -## 1. Write a shell function (this is your test). -## 2. Add the function to resize_util_tests (on a new line). -## -. $(dirname $0)/tools_common.sh - -# Environment check: $YUV_RAW_INPUT is required. -resize_util_verify_environment() { - if [ ! -e "${YUV_RAW_INPUT}" ]; then - echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." - return 1 - fi -} - -# Resizes $YUV_RAW_INPUT using the resize_util example. $1 is the output -# dimensions that will be passed to resize_util. -resize_util() { - local resizer="${LIBVPX_BIN_PATH}/resize_util${VPX_TEST_EXE_SUFFIX}" - local output_file="${VPX_TEST_OUTPUT_DIR}/resize_util.raw" - local frames_to_resize="10" - local target_dimensions="$1" - - # resize_util is available only when CONFIG_SHARED is disabled. - if [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ]; then - if [ ! -x "${resizer}" ]; then - elog "${resizer} does not exist or is not executable." - return 1 - fi - - eval "${VPX_TEST_PREFIX}" "${resizer}" "${YUV_RAW_INPUT}" \ - "${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}" \ - "${target_dimensions}" "${output_file}" ${frames_to_resize} \ - ${devnull} - - [ -e "${output_file}" ] || return 1 - fi -} - -# Halves each dimension of $YUV_RAW_INPUT using resize_util(). -resize_down() { - local target_width=$((${YUV_RAW_INPUT_WIDTH} / 2)) - local target_height=$((${YUV_RAW_INPUT_HEIGHT} / 2)) - - resize_util "${target_width}x${target_height}" -} - -# Doubles each dimension of $YUV_RAW_INPUT using resize_util(). -resize_up() { - local target_width=$((${YUV_RAW_INPUT_WIDTH} * 2)) - local target_height=$((${YUV_RAW_INPUT_HEIGHT} * 2)) - - resize_util "${target_width}x${target_height}" -} - -resize_util_tests="resize_down - resize_up" - -run_tests resize_util_verify_environment "${resize_util_tests}" diff --git a/media/libvpx/libvpx/test/sad_test.cc b/media/libvpx/libvpx/test/sad_test.cc index 837b08fbdf..d8d304f3c6 100644 --- a/media/libvpx/libvpx/test/sad_test.cc +++ b/media/libvpx/libvpx/test/sad_test.cc @@ -8,21 +8,27 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include -#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" + +// const[expr] should be sufficient for DECLARE_ALIGNED but early +// implementations of c++11 appear to have some issues with it. +#define kDataAlignment 32 template struct TestParams { @@ -32,19 +38,32 @@ struct TestParams { Function func; }; -typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride); -typedef TestParams SadMxNParam; +using SadMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +using SadMxNParam = TestParams; -typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred); -typedef TestParams SadMxNAvgParam; +using SadSkipMxNFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +using SadSkipMxNParam = TestParams; -typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -typedef TestParams SadMxNx4Param; +using SadMxNAvgFunc = unsigned int (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred); +using SadMxNAvgParam = TestParams; + +using SadMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +using SadMxNx4Param = TestParams; + +using SadSkipMxNx4Func = void (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], + int ref_stride, unsigned int *sad_array); +using SadSkipMxNx4Param = TestParams; + +using SadMxNx8Func = void (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sad_array); using libvpx_test::ACMRandom; @@ -54,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam { public: explicit SADTestBase(const ParamType ¶ms) : params_(params) {} - virtual void SetUp() { + void SetUp() override { source_data8_ = reinterpret_cast( vpx_memalign(kDataAlignment, kDataBlockSize)); reference_data8_ = reinterpret_cast( @@ -84,53 +103,64 @@ class SADTestBase : public ::testing::TestWithParam { #endif // CONFIG_VP9_HIGHBITDEPTH } mask_ = (1 << bit_depth_) - 1; - source_stride_ = (params_.width + 31) & ~31; + source_stride_ = (params_.width + 63) & ~63; reference_stride_ = params_.width * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); } - virtual void TearDown() { + void TearDown() override { vpx_free(source_data8_); - source_data8_ = NULL; + source_data8_ = nullptr; vpx_free(reference_data8_); - reference_data8_ = NULL; + reference_data8_ = nullptr; vpx_free(second_pred8_); - second_pred8_ = NULL; + second_pred8_ = nullptr; vpx_free(source_data16_); - source_data16_ = NULL; + source_data16_ = nullptr; vpx_free(reference_data16_); - reference_data16_ = NULL; + reference_data16_ = nullptr; vpx_free(second_pred16_); - second_pred16_ = NULL; + second_pred16_ = nullptr; libvpx_test::ClearSystemState(); } protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 - static const int kDataAlignment = 16; + // crbug.com/webm/1660 static const int kDataBlockSize = 64 * 128; static const int kDataBufferSize = 4 * kDataBlockSize; - uint8_t *GetReference(int block_idx) const { + int GetBlockRefOffset(int block_idx) const { + return block_idx * kDataBlockSize; + } + + uint8_t *GetReferenceFromOffset(int ref_offset) const { + assert((params_.height - 1) * reference_stride_ + params_.width - 1 + + ref_offset < + kDataBufferSize); #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth_) { return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) + - block_idx * kDataBlockSize); + ref_offset); } #endif // CONFIG_VP9_HIGHBITDEPTH - return reference_data_ + block_idx * kDataBlockSize; + return reference_data_ + ref_offset; + } + + uint8_t *GetReference(int block_idx) const { + return GetReferenceFromOffset(GetBlockRefOffset(block_idx)); } // Sum of Absolute Differences. Given two blocks, calculate the absolute // difference between two pixels in the same relative location; accumulate. - uint32_t ReferenceSAD(int block_idx) const { + uint32_t ReferenceSAD(int ref_offset) const { uint32_t sad = 0; - const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); const uint8_t *const source8 = source_data_; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *const reference16 = - CONVERT_TO_SHORTPTR(GetReference(block_idx)); + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); #endif // CONFIG_VP9_HIGHBITDEPTH for (int h = 0; h < params_.height; ++h) { @@ -149,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. + uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -201,24 +259,28 @@ class SADTestBase : public ::testing::TestWithParam { } } - void FillRandom(uint8_t *data, int stride) { + void FillRandomWH(uint8_t *data, int stride, int w, int h) { uint8_t *data8 = data; #if CONFIG_VP9_HIGHBITDEPTH uint16_t *data16 = CONVERT_TO_SHORTPTR(data); #endif // CONFIG_VP9_HIGHBITDEPTH - for (int h = 0; h < params_.height; ++h) { - for (int w = 0; w < params_.width; ++w) { + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { if (!use_high_bit_depth_) { - data8[h * stride + w] = rnd_.Rand8(); + data8[r * stride + c] = rnd_.Rand8(); #if CONFIG_VP9_HIGHBITDEPTH } else { - data16[h * stride + w] = rnd_.Rand16() & mask_; + data16[r * stride + c] = rnd_.Rand16() & mask_; #endif // CONFIG_VP9_HIGHBITDEPTH } } } } + void FillRandom(uint8_t *data, int stride) { + FillRandomWH(data, stride, params_.width, params_.height); + } + uint32_t mask_; vpx_bit_depth_t bit_depth_; int source_stride_; @@ -253,18 +315,45 @@ class SADx4Test : public SADTestBase { } void CheckSADs() const { - uint32_t reference_sad, exp_sad[4]; + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); SADs(exp_sad); for (int block = 0; block < 4; ++block) { - reference_sad = ReferenceSAD(block); + reference_sad = ReferenceSAD(GetBlockRefOffset(block)); EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; } } }; -class SADTest : public SADTestBase { +class SADSkipx4Test : public SADTestBase { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + +class SADTest : public AbstractBench, public SADTestBase { public: SADTest() : SADTestBase(GetParam()) {} @@ -279,14 +368,46 @@ class SADTest : public SADTestBase { } void CheckSAD() const { - const unsigned int reference_sad = ReferenceSAD(0); + const unsigned int reference_sad = ReferenceSAD(GetBlockRefOffset(0)); const unsigned int exp_sad = SAD(0); ASSERT_EQ(reference_sad, exp_sad); } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } }; -class SADavgTest : public SADTestBase { +class SADSkipTest : public AbstractBench, public SADTestBase { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + +class SADavgTest : public AbstractBench, public SADTestBase { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -307,6 +428,11 @@ class SADavgTest : public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_, second_pred_); + } }; TEST_P(SADTest, MaxRef) { @@ -350,6 +476,69 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -395,6 +584,19 @@ TEST_P(SADavgTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADavgTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, params_.width); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADx4Test, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(GetReference(0), reference_stride_, mask_); @@ -463,6 +665,136 @@ TEST_P(SADx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } +TEST_P(SADx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -521,7 +853,57 @@ const SadMxNParam c_tests[] = { SadMxNParam(4, 4, &vpx_highbd_sad4x4_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); + +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), @@ -579,7 +961,7 @@ const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); const SadMxNx4Param x4d_c_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_c), @@ -637,28 +1019,418 @@ const SadMxNx4Param x4d_c_tests[] = { SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_c, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); + +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, &vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10), + SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12), + SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12), + SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12), + SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12), + SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12), + SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH + }; -INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10), + SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12), + SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12), + SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12), + SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12), + SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, + ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10), + SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12), + SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12), + SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12), + SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12), + SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); +INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNx4Param x4d_neon_dotprod_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, + ::testing::ValuesIn(x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); + +#if HAVE_NEONE_DOTPROD +const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD #endif // HAVE_NEON //------------------------------------------------------------------------------ @@ -714,7 +1486,55 @@ const SadMxNParam sse2_tests[] = { SadMxNParam(8, 4, &vpx_highbd_sad8x4_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); + +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), @@ -766,7 +1586,7 @@ const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); const SadMxNx4Param x4d_sse2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_sse2), @@ -824,7 +1644,58 @@ const SadMxNx4Param x4d_sse2_tests[] = { SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_sse2, 12), #endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -835,10 +1706,6 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); // Only functions are x3, which do not have tests. #endif // HAVE_SSSE3 -#if HAVE_SSE4_1 -// Only functions are x8, which do not have tests. -#endif // HAVE_SSE4_1 - #if HAVE_AVX2 const SadMxNParam avx2_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_avx2), @@ -846,8 +1713,74 @@ const SadMxNParam avx2_tests[] = { SadMxNParam(32, 64, &vpx_sad32x64_avx2), SadMxNParam(32, 32, &vpx_sad32x32_avx2), SadMxNParam(32, 16, &vpx_sad32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10), + + SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12), + SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12), + SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12), + SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12), + SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12), + SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12), + SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12), + SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); + +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), @@ -855,16 +1788,140 @@ const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2), SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2), SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10), + SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12), + SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12), + SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12), + SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12), + SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12), + SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12), + SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12), + SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); const SadMxNx4Param x4d_avx2_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10), + SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12), + SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12), + SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12), + SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12), + SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12), + SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12), + SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12), + SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH }; -INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); + +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 +#if HAVE_AVX512 +const SadMxNParam avx512_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_avx512), + SadMxNParam(64, 32, &vpx_sad64x32_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADTest, ::testing::ValuesIn(avx512_tests)); + +const SadSkipMxNParam skip_avx512_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx512), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADSkipTest, + ::testing::ValuesIn(skip_avx512_tests)); + +const SadMxNAvgParam avg_avx512_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx512), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADavgTest, + ::testing::ValuesIn(avg_avx512_tests)); + +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); + +const SadSkipMxNx4Param skip_x4d_avx512_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx512), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx512), +}; +INSTANTIATE_TEST_SUITE_P(AVX512, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx512_tests)); +#endif // HAVE_AVX512 + //------------------------------------------------------------------------------ // MIPS functions #if HAVE_MSA @@ -883,7 +1940,7 @@ const SadMxNParam msa_tests[] = { SadMxNParam(4, 8, &vpx_sad4x8_msa), SadMxNParam(4, 4, &vpx_sad4x4_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); const SadMxNAvgParam avg_msa_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_msa), @@ -900,7 +1957,7 @@ const SadMxNAvgParam avg_msa_tests[] = { SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_msa), SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests)); const SadMxNx4Param x4d_msa_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_msa), @@ -917,7 +1974,133 @@ const SadMxNx4Param x4d_msa_tests[] = { SadMxNx4Param(4, 8, &vpx_sad4x8x4d_msa), SadMxNx4Param(4, 4, &vpx_sad4x4x4d_msa), }; -INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); +INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests)); #endif // HAVE_MSA +//------------------------------------------------------------------------------ +// VSX functions +#if HAVE_VSX +const SadMxNParam vsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_vsx), + SadMxNParam(64, 32, &vpx_sad64x32_vsx), + SadMxNParam(32, 64, &vpx_sad32x64_vsx), + SadMxNParam(32, 32, &vpx_sad32x32_vsx), + SadMxNParam(32, 16, &vpx_sad32x16_vsx), + SadMxNParam(16, 32, &vpx_sad16x32_vsx), + SadMxNParam(16, 16, &vpx_sad16x16_vsx), + SadMxNParam(16, 8, &vpx_sad16x8_vsx), + SadMxNParam(8, 16, &vpx_sad8x16_vsx), + SadMxNParam(8, 8, &vpx_sad8x8_vsx), + SadMxNParam(8, 4, &vpx_sad8x4_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; +INSTANTIATE_TEST_SUITE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); +#endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI + +//------------------------------------------------------------------------------ +// loongarch functions +#if HAVE_LSX +const SadMxNParam lsx_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_lsx), + SadMxNParam(32, 32, &vpx_sad32x32_lsx), + SadMxNParam(16, 16, &vpx_sad16x16_lsx), + SadMxNParam(8, 8, &vpx_sad8x8_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests)); + +const SadMxNAvgParam avg_lsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests)); + +const SadMxNx4Param x4d_lsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx), +}; +INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests)); +#endif // HAVE_LSX + } // namespace diff --git a/media/libvpx/libvpx/test/set_maps.sh b/media/libvpx/libvpx/test/set_maps.sh index e7c8d43fa8..f45dc51f49 100644 --- a/media/libvpx/libvpx/test/set_maps.sh +++ b/media/libvpx/libvpx/test/set_maps.sh @@ -36,7 +36,7 @@ set_maps() { eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } diff --git a/media/libvpx/libvpx/test/set_roi.cc b/media/libvpx/libvpx/test/set_roi.cc index 38711a806d..ac07ca161f 100644 --- a/media/libvpx/libvpx/test/set_roi.cc +++ b/media/libvpx/libvpx/test/set_roi.cc @@ -15,7 +15,7 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vpx_integer.h" @@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Initialize elements of cpi with valid defaults. VP8_COMP cpi; - cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; cpi.cyclic_refresh_mode_enabled = 0; cpi.mb.e_mbd.segmentation_enabled = 0; cpi.mb.e_mbd.update_mb_segmentation_map = 0; @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, @@ -169,6 +161,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Free allocated memory if (cpi.segmentation_map) vpx_free(cpi.segmentation_map); if (roi_map) vpx_free(roi_map); -}; +} } // namespace diff --git a/media/libvpx/libvpx/test/simple_decoder.sh b/media/libvpx/libvpx/test/simple_decoder.sh index 7eeaf71b1c..65fc4828ed 100644 --- a/media/libvpx/libvpx/test/simple_decoder.sh +++ b/media/libvpx/libvpx/test/simple_decoder.sh @@ -38,7 +38,7 @@ simple_decoder() { fi eval "${VPX_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } diff --git a/media/libvpx/libvpx/test/simple_encoder.sh b/media/libvpx/libvpx/test/simple_encoder.sh index ee633ae99e..dc7f46ff38 100644 --- a/media/libvpx/libvpx/test/simple_encoder.sh +++ b/media/libvpx/libvpx/test/simple_encoder.sh @@ -36,7 +36,7 @@ simple_encoder() { eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } diff --git a/media/libvpx/libvpx/test/stress.sh b/media/libvpx/libvpx/test/stress.sh index 9523824764..ba79a52ac3 100644 --- a/media/libvpx/libvpx/test/stress.sh +++ b/media/libvpx/libvpx/test/stress.sh @@ -8,15 +8,17 @@ ## in the file PATENTS. All contributing project authors may ## be found in the AUTHORS file in the root of the source tree. ## -## This file performs a stress test. It runs 5 encodes and 30 decodes in -## parallel. +## This file performs a stress test. It runs (STRESS_ONEPASS_MAX_JOBS, +## default=5) one, (STRESS_TWOPASS_MAX_JOBS, default=5) two pass & +## (STRESS_RT_MAX_JOBS, default=5) encodes and (STRESS__DECODE_MAX_JOBS, +## default=30) decodes in parallel. . $(dirname $0)/tools_common.sh YUV="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.yuv" VP8="${LIBVPX_TEST_DATA_PATH}/tos_vp8.webm" VP9="${LIBVPX_TEST_DATA_PATH}/vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm" -DATA_URL="http://downloads.webmproject.org/test_data/libvpx/" +DATA_URL="https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/" SHA1_FILE="$(dirname $0)/test-data.sha1" # Set sha1sum to proper sha program (sha1sum, shasum, sha1). This code is @@ -28,7 +30,7 @@ SHA1_FILE="$(dirname $0)/test-data.sha1" # Download a file from the url and check its sha1sum. download_and_check_file() { # Get the file from the file path. - local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}" + local root="${1#${LIBVPX_TEST_DATA_PATH}/}" # Download the file using curl. Trap to insure non partial file. (trap "rm -f $1" INT TERM \ @@ -50,7 +52,7 @@ stress_verify_environment() { fi for file in "${YUV}" "${VP8}" "${VP9}"; do if [ ! -e "${file}" ] ; then - download_and_check_file "${file}" + download_and_check_file "${file}" || return 1 fi done if [ ! -e "${YUV}" ] || [ ! -e "${VP8}" ] || [ ! -e "${VP9}" ] ; then @@ -70,25 +72,38 @@ stress_verify_environment() { # This function runs tests on libvpx that run multiple encodes and decodes # in parallel in hopes of catching synchronization and/or threading issues. stress() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly codec="$1" - local readonly webm="$2" - local readonly decode_count="$3" + local decoder="$(vpx_tool_path vpxdec)" + local encoder="$(vpx_tool_path vpxenc)" + local codec="$1" + local webm="$2" + local decode_count="$3" + local threads="$4" + local enc_args="$5" local pids="" local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} + local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} local twopass_max_jobs=${STRESS_TWOPASS_MAX_JOBS:-5} # Enable job control, so we can run multiple processes. set -m + # Start $onepass_max_jobs encode jobs in parallel. + for i in $(seq ${onepass_max_jobs}); do + bitrate=$(($i * 20 + 300)) + eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=1" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.1pass.webm" \ + "${enc_args}" ${devnull} & + pids="${pids} $!" + done + # Start $twopass_max_jobs encode jobs in parallel. for i in $(seq ${twopass_max_jobs}); do bitrate=$(($i * 20 + 300)) eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ - "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \ - "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.webm" \ - ${devnull} & + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal --passes=2" \ + "--target-bitrate=${bitrate} -o ${VPX_TEST_OUTPUT_DIR}/${i}.2pass.webm" \ + "${enc_args}" ${devnull} & pids="${pids} $!" done @@ -96,7 +111,7 @@ stress() { for i in $(seq ${rt_max_jobs}); do bitrate=$(($i * 20 + 300)) eval "${VPX_TEST_PREFIX}" "${encoder}" "--codec=${codec} -w 1280 -h 720" \ - "${YUV}" "-t 4 --limit=150 --test-decode=fatal " \ + "${YUV}" "-t ${threads} --limit=150 --test-decode=fatal " \ "--target-bitrate=${bitrate} --lag-in-frames=0 --error-resilient=1" \ "--kf-min-dist=3000 --kf-max-dist=3000 --cpu-used=-6 --static-thresh=1" \ "--end-usage=cbr --min-q=2 --max-q=56 --undershoot-pct=100" \ @@ -109,7 +124,7 @@ stress() { # Start $decode_count decode jobs in parallel. for i in $(seq "${decode_count}"); do - eval "${decoder}" "-t 4" "${webm}" "--noblit" ${devnull} & + eval "${decoder}" "-t ${threads}" "${webm}" "--noblit" ${devnull} & pids="${pids} $!" done @@ -125,17 +140,44 @@ vp8_stress_test() { local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} if [ "$(vp8_decode_available)" = "yes" -a \ "$(vp8_encode_available)" = "yes" ]; then - stress vp8 "${VP8}" "${vp8_max_jobs}" + stress vp8 "${VP8}" "${vp8_max_jobs}" 4 fi } -vp9_stress_test() { +vp8_stress_test_token_parititions() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + for threads in 2 4 8; do + for token_partitions in 1 2 3; do + stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \ + "--token-parts=$token_partitions" + done + done + fi +} + +vp9_stress() { local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} if [ "$(vp9_decode_available)" = "yes" -a \ "$(vp9_encode_available)" = "yes" ]; then - stress vp9 "${VP9}" "${vp9_max_jobs}" + stress vp9 "${VP9}" "${vp9_max_jobs}" "$@" fi } -run_tests stress_verify_environment "vp8_stress_test vp9_stress_test" +vp9_stress_test() { + for threads in 4 8 64; do + vp9_stress "$threads" "--row-mt=0" + done +} + +vp9_stress_test_row_mt() { + for threads in 4 8 64; do + vp9_stress "$threads" "--row-mt=1" + done +} + +run_tests stress_verify_environment \ + "vp8_stress_test vp8_stress_test_token_parititions + vp9_stress_test vp9_stress_test_row_mt" diff --git a/media/libvpx/libvpx/test/sum_squares_test.cc b/media/libvpx/libvpx/test/sum_squares_test.cc index 3aa43d2ce0..8b57e772d1 100644 --- a/media/libvpx/libvpx/test/sum_squares_test.cc +++ b/media/libvpx/libvpx/test/sum_squares_test.cc @@ -9,10 +9,12 @@ */ #include +#include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -20,30 +22,36 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; namespace { const int kNumIterations = 10000; -typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size); -typedef std::tr1::tuple SumSquaresParam; +using SSI16Func = uint64_t (*)(const int16_t *src, int stride, int size); +using SumSquaresParam = std::tuple; class SumSquaresTest : public ::testing::TestWithParam { public: - virtual ~SumSquaresTest() {} - virtual void SetUp() { + ~SumSquaresTest() override = default; + void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: SSI16Func ref_func_; SSI16Func tst_func_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SumSquaresTest); TEST_P(SumSquaresTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -102,12 +110,239 @@ TEST_P(SumSquaresTest, ExtremeValues) { } } -using std::tr1::make_tuple; +using std::make_tuple; + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_sve))); +#endif // HAVE_SVE #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, SumSquaresTest, ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2))); #endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); +#endif // HAVE_MSA + +using SSEFunc = int64_t (*)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +using SSETestParam = std::tuple; + +class SSETest : public ::testing::TestWithParam { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast(src_); + uint16_t *ref16 = reinterpret_cast(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 } // namespace diff --git a/media/libvpx/libvpx/test/superframe_test.cc b/media/libvpx/libvpx/test/superframe_test.cc index 421dfccd60..9319fab2ae 100644 --- a/media/libvpx/libvpx/test/superframe_test.cc +++ b/media/libvpx/libvpx/test/superframe_test.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" @@ -18,36 +20,36 @@ namespace { const int kTestMode = 0; -typedef std::tr1::tuple SuperframeTestParam; +using SuperframeTestParam = std::tuple; class SuperframeTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam { protected: SuperframeTest() - : EncoderTest(GET_PARAM(0)), modified_buf_(NULL), last_sf_pts_(0) {} - virtual ~SuperframeTest() {} + : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} + ~SuperframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); - const libvpx_test::TestMode mode = std::tr1::get(input); + const libvpx_test::TestMode mode = std::get(input); SetMode(mode); sf_count_ = 0; sf_count_max_ = INT_MAX; } - virtual void TearDown() { delete[] modified_buf_; } + void TearDown() override { delete[] modified_buf_; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; const uint8_t *buffer = reinterpret_cast(pkt->data.frame.buf); @@ -93,7 +95,7 @@ TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) { EXPECT_EQ(sf_count_, 1); } -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( SuperframeTest, ::testing::Combine(::testing::Values(::libvpx_test::kTwoPassGood), ::testing::Values(0))); diff --git a/media/libvpx/libvpx/test/svc_datarate_test.cc b/media/libvpx/libvpx/test/svc_datarate_test.cc new file mode 100644 index 0000000000..7e0f95277e --- /dev/null +++ b/media/libvpx/libvpx/test/svc_datarate_test.cc @@ -0,0 +1,2208 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +enum INTER_LAYER_PRED { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +}; + +class DatarateOnePassCbrSvc : public OnePassCbrSvc { + public: + explicit DatarateOnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : OnePassCbrSvc(codec) { + inter_layer_pred_mode_ = 0; + } + + protected: + ~DatarateOnePassCbrSvc() override = default; + + virtual void ResetModel() { + last_pts_ = 0; + duration_ = 0.0; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + update_pattern_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + change_bitrate_ = false; + last_pts_ref_ = 0; + middle_bitrate_ = 0; + top_bitrate_ = 0; + superframe_count_ = -1; + key_frame_spacing_ = 9999; + num_nonref_frames_ = 0; + layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; + insert_layer_sync_ = 0; + layer_sync_on_base_ = 0; + force_intra_only_frame_ = 0; + superframe_has_intra_only_ = 0; + use_post_encode_drop_ = 0; + denoiser_off_on_ = false; + denoiser_enable_layers_ = false; + num_resize_down_ = 0; + num_resize_up_ = 0; + for (int i = 0; i < VPX_MAX_LAYERS; i++) { + prev_frame_width_[i] = 320; + prev_frame_height_[i] = 240; + } + ksvc_flex_noupd_tlenh_ = false; + external_resize_dynamic_drop_layer_ = false; + external_resize_pattern_ = 0; + superframe_cnt_ = 0; + } + void BeginPassHook(unsigned int /*pass*/) override {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. + void set_frame_flags_bypass_mode(int tl, int num_spatial_layers, + int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config, + int noupdate_tlenh) { + for (int sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl - 1); + ref_frame_config->alt_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl); + } + if (!tl) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + // Non reference frame on top temporal top spatial. + ref_frame_config->update_buffer_slot[sl] = 0; + } + // Force no update on all spatial layers for temporal enhancement layer + // frames. + if (noupdate_tlenh) ref_frame_config->update_buffer_slot[sl] = 0; + } + } + } + + void CheckLayerRateTargeting(int num_spatial_layers, int num_temporal_layers, + double thresh_overshoot, + double thresh_undershoot) const { + for (int sl = 0; sl < num_spatial_layers; ++sl) + for (int tl = 0; tl < num_temporal_layers; ++tl) { + const int layer = sl * num_temporal_layers + tl; + ASSERT_GE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too " + "much!"; + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + + if (video->frame() == 0) { + if (force_intra_only_frame_) { + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tess to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + } + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + + if (layer_framedrop_) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = LAYER_DROP; + for (int i = 0; i < number_spatial_layers_; i++) + svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + + if (use_post_encode_drop_) { + encoder->Control(VP9E_SET_POSTENCODE_DROP, use_post_encode_drop_); + } + // We want to force external resize on the very first frame. + if (external_resize_dynamic_drop_layer_) video->Next(); + } + + if (denoiser_off_on_) { + encoder->Control(VP9E_SET_AQ_MODE, 3); + // Set inter_layer_pred to INTER_LAYER_PRED_OFF_NONKEY (K-SVC). + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, 2); + if (!denoiser_enable_layers_) { + if (video->frame() == 0) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + else if (video->frame() == 100) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } else { + // Cumulative bitrates for top spatial layers, for + // 3 temporal layers. + if (video->frame() == 0) { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + // Change layer bitrates to set top spatial layer to 0. + // This is for 3 spatial 3 temporal layers. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + for (int i = 0; i < 3; i++) + bitrate_sl3_[i] = cfg_.layer_target_bitrate[i + 6]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to non-zero on top spatial layer. + // This will trigger skip encoding of top spatial layer + // on key frame (period = 100). + for (int i = 0; i < 3; i++) + cfg_.layer_target_bitrate[i + 6] = bitrate_sl3_[i]; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[8]; + encoder->Config(&cfg_); + } else if (video->frame() == 120) { + // Enable denoiser and top spatial layer after key frame (period is + // 100). + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } + } + } + + if (ksvc_flex_noupd_tlenh_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config_, + 1); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config_, + 0); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + + if (change_bitrate_ && video->frame() == 200) { + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, + 0.78, 1.15); + + memset(file_datarate_, 0, sizeof(file_datarate_)); + memset(bits_total_, 0, sizeof(bits_total_)); + int64_t bits_in_buffer_model_tmp[VPX_MAX_LAYERS]; + last_pts_ref_ = last_pts_; + // Set new target bitarate. + cfg_.rc_target_bitrate = cfg_.rc_target_bitrate >> 1; + // Buffer level should not reset on dynamic bitrate change. + memcpy(bits_in_buffer_model_tmp, bits_in_buffer_model_, + sizeof(bits_in_buffer_model_)); + AssignLayerBitrates(); + memcpy(bits_in_buffer_model_, bits_in_buffer_model_tmp, + sizeof(bits_in_buffer_model_)); + + // Change config to update encoder with new bitrate configuration. + encoder->Config(&cfg_); + } + + if (external_resize_dynamic_drop_layer_) { + frame_flags_ = 0; + for (int i = 0; i < 9; ++i) { + svc_params_.min_quantizers[i] = 20; + svc_params_.max_quantizers[i] = 56; + } + if (video->frame() == 1 || video->frame() == 150) { + // Set the new top width/height for external resize. + top_sl_width_ = video->img()->d_w; + top_sl_height_ = video->img()->d_h; + for (int i = 0; i < 9; ++i) { + bitrate_layer_[i] = cfg_.layer_target_bitrate[i]; + } + if (external_resize_pattern_ == 1) { + // Input size is 1/4. 2 top spatial layers are dropped. + // This will trigger skip encoding/dropping of two top spatial layers. + cfg_.rc_target_bitrate -= + cfg_.layer_target_bitrate[5] + cfg_.layer_target_bitrate[8]; + for (int i = 3; i < 9; ++i) { + cfg_.layer_target_bitrate[i] = 0; + } + for (int sl = 0; sl < 3; sl++) { + svc_params_.scaling_factor_num[sl] = 1; + svc_params_.scaling_factor_den[sl] = 1; + } + } else if (external_resize_pattern_ == 2) { + // Input size is 1/2. Top spatial layer is dropped. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + for (int i = 6; i < 9; ++i) { + cfg_.layer_target_bitrate[i] = 0; + } + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + svc_params_.scaling_factor_num[2] = 1; + svc_params_.scaling_factor_den[2] = 1; + } + encoder->Config(&cfg_); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (video->frame() == 50 || video->frame() == 200) { + top_sl_width_ = video->img()->d_w; + top_sl_height_ = video->img()->d_h; + if (external_resize_pattern_ == 1) { + // Input size is 1/2. Change layer bitrates to set top layer to 0. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate += bitrate_layer_[5]; + for (int i = 3; i < 6; ++i) { + cfg_.layer_target_bitrate[i] = bitrate_layer_[i]; + } + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + svc_params_.scaling_factor_num[2] = 1; + svc_params_.scaling_factor_den[2] = 1; + } else if (external_resize_pattern_ == 2) { + // Input size is 1/4. Change layer bitrates to set two top layers to + // 0. This will trigger skip encoding/dropping of two top spatial + // layers. + cfg_.rc_target_bitrate -= bitrate_layer_[5]; + for (int i = 3; i < 6; ++i) { + cfg_.layer_target_bitrate[i] = 0; + } + for (int sl = 0; sl < 3; sl++) { + svc_params_.scaling_factor_num[sl] = 1; + svc_params_.scaling_factor_den[sl] = 1; + } + } + encoder->Config(&cfg_); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (video->frame() == 100 || video->frame() == 250) { + top_sl_width_ = video->img()->d_w; + top_sl_height_ = video->img()->d_h; + // Input is original size. Change layer bitrates to nonzero for all + // layers. + cfg_.rc_target_bitrate = + bitrate_layer_[2] + bitrate_layer_[5] + bitrate_layer_[8]; + for (int i = 0; i < 9; ++i) { + cfg_.layer_target_bitrate[i] = bitrate_layer_[i]; + } + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 2; + svc_params_.scaling_factor_num[2] = 1; + svc_params_.scaling_factor_den[2] = 1; + encoder->Config(&cfg_); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } + } else if (dynamic_drop_layer_ && !single_layer_resize_) { + if (video->frame() == 0) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will trigger skip encoding of top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { + // Change layer bitrate on second layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 200) { + // Change layer bitrate on top layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2]; + encoder->Config(&cfg_); + } + } else if (dynamic_drop_layer_ && single_layer_resize_) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top spatial layers. + if (video->frame() == 2) { + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + // Set spatial layer 0 to a very low bitrate to trigger resize. + cfg_.layer_target_bitrate[0] = 30; + cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Set base spatial layer to very high to go back up to original size. + cfg_.layer_target_bitrate[0] = 400; + cfg_.rc_target_bitrate = cfg_.layer_target_bitrate[0]; + encoder->Config(&cfg_); + } + } else if (!dynamic_drop_layer_ && single_layer_resize_) { + if (video->frame() == 2) { + cfg_.layer_target_bitrate[0] = 30; + cfg_.layer_target_bitrate[1] = 50; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } else if (video->frame() == 160) { + cfg_.layer_target_bitrate[0] = 1500; + cfg_.layer_target_bitrate[1] = 2000; + cfg_.rc_target_bitrate = + (cfg_.layer_target_bitrate[0] + cfg_.layer_target_bitrate[1]); + encoder->Config(&cfg_); + } + } + if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF; + + if (insert_layer_sync_) { + vpx_svc_spatial_layer_sync_t svc_layer_sync; + svc_layer_sync.base_layer_intra_only = 0; + for (int i = 0; i < number_spatial_layers_; i++) + svc_layer_sync.spatial_layer_sync[i] = 0; + if (force_intra_only_frame_) { + superframe_has_intra_only_ = 0; + if (video->frame() == 0) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } else if (video->frame() == 100) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } + } else { + layer_sync_on_base_ = 0; + if (video->frame() == 150) { + svc_layer_sync.spatial_layer_sync[1] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 240) { + svc_layer_sync.spatial_layer_sync[2] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 320) { + svc_layer_sync.spatial_layer_sync[0] = 1; + layer_sync_on_base_ = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } + } + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + superframe_cnt_++; + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + uint32_t sizes[8] = { 0 }; + uint32_t sizes_parsed[8] = { 0 }; + int count = 0; + int num_layers_encoded = 0; + last_pts_ = pkt->data.frame.pts; + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (external_resize_dynamic_drop_layer_) { + // No key frames expected in stream, except for first. + if (cfg_.kf_max_dist > 1000) { + ASSERT_FALSE(key_frame && superframe_cnt_ > 1); + } + } + if (key_frame) { + // For test that inserts layer sync frames: requesting a layer_sync on + // the base layer must force key frame. So if any key frame occurs after + // first superframe it must due to layer sync on base spatial layer. + if (superframe_count_ > 0 && insert_layer_sync_ && + !force_intra_only_frame_) { + ASSERT_EQ(layer_sync_on_base_, 1); + } + temporal_layer_id_ = 0; + superframe_count_ = 0; + } + parse_superframe_index(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, sizes_parsed, &count); + // Count may be less than number of spatial layers because of frame drops. + if (number_spatial_layers_ > 1) { + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + } + // For superframe with Intra-only count will be +1 larger + // because of no-show frame. + if (force_intra_only_frame_ && superframe_has_intra_only_) + ASSERT_EQ(count, num_layers_encoded + 1); + else + ASSERT_EQ(count, num_layers_encoded); + + // In the constrained frame drop mode, if a given spatial is dropped all + // upper layers must be dropped too. + if (!layer_framedrop_) { + int num_layers_dropped = 0; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (!pkt->data.frame.spatial_layer_encoded[sl]) { + // Check that all upper layers are dropped. + num_layers_dropped++; + for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) + ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); + } + } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; + } + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + // Fo short key frame spacing, buffer can underrun on individual frames. + if (!key_frame && tl > 0 && key_frame_spacing_ < 100) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } + + if (!single_layer_resize_ && sl < number_spatial_layers_ - 1) { + unsigned int scaled_width = top_sl_width_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_width % 2 != 0) scaled_width += 1; + ASSERT_EQ(pkt->data.frame.width[sl], scaled_width); + unsigned int scaled_height = top_sl_height_ * + svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]; + if (scaled_height % 2 != 0) scaled_height += 1; + ASSERT_EQ(pkt->data.frame.height[sl], scaled_height); + } else if (superframe_count_ > 0) { + if (pkt->data.frame.width[sl] < prev_frame_width_[sl] && + pkt->data.frame.height[sl] < prev_frame_height_[sl]) + num_resize_down_ += 1; + if (pkt->data.frame.width[sl] > prev_frame_width_[sl] && + pkt->data.frame.height[sl] > prev_frame_height_[sl]) + num_resize_up_ += 1; + } + prev_frame_width_[sl] = pkt->data.frame.width[sl]; + prev_frame_height_[sl] = pkt->data.frame.height[sl]; + } + } + + void EndPassHook() override { + if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + } + + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { + // TODO(marpan): Look into why an assert is triggered in compute_psnr + // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. + // Has to do with dropped frames in bypass/flexible svc mode. + if (!ksvc_flex_noupd_tlenh_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetNonRefFrames() { return num_nonref_frames_; } + + vpx_codec_pts_t last_pts_; + double timebase_; + int64_t bits_total_[VPX_MAX_LAYERS]; + double duration_; + double file_datarate_[VPX_MAX_LAYERS]; + size_t bits_in_last_frame_; + double mismatch_psnr_; + int denoiser_on_; + int tune_content_; + int spatial_layer_id_; + bool dynamic_drop_layer_; + bool single_layer_resize_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config_; + int update_pattern_; + bool change_bitrate_; + vpx_codec_pts_t last_pts_ref_; + int middle_bitrate_; + int top_bitrate_; + int key_frame_spacing_; + int layer_framedrop_; + int force_key_; + int force_key_test_; + int inter_layer_pred_mode_; + int insert_layer_sync_; + int layer_sync_on_base_; + int force_intra_only_frame_; + int superframe_has_intra_only_; + int use_post_encode_drop_; + int bitrate_sl3_[3]; + // Denoiser switched on the fly. + bool denoiser_off_on_; + // Top layer enabled on the fly. + bool denoiser_enable_layers_; + int num_resize_up_; + int num_resize_down_; + unsigned int prev_frame_width_[VPX_MAX_LAYERS]; + unsigned int prev_frame_height_[VPX_MAX_LAYERS]; + bool ksvc_flex_noupd_tlenh_; + bool external_resize_dynamic_drop_layer_; + int bitrate_layer_[9]; + int external_resize_pattern_; + int superframe_cnt_; + + private: + void SetConfig(const int num_temporal_layer) override { + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } + + unsigned int num_nonref_frames_; + unsigned int mismatch_nframes_; +}; + +void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, + unsigned int initial_h, unsigned int *w, + unsigned int *h, int resize_pattern) { + *w = initial_w; + *h = initial_h; + if (resize_pattern == 1) { + if (frame < 50) { + *w = initial_w / 4; + *h = initial_h / 4; + } else if (frame < 100) { + *w = initial_w / 2; + *h = initial_h / 2; + } else if (frame < 150) { + *w = initial_w; + *h = initial_h; + } else if (frame < 200) { + *w = initial_w / 4; + *h = initial_h / 4; + } else if (frame < 250) { + *w = initial_w / 2; + *h = initial_h / 2; + } + } else if (resize_pattern == 2) { + if (frame < 50) { + *w = initial_w / 2; + *h = initial_h / 2; + } else if (frame < 100) { + *w = initial_w / 4; + *h = initial_h / 4; + } else if (frame < 150) { + *w = initial_w; + *h = initial_h; + } else if (frame < 200) { + *w = initial_w / 2; + *h = initial_h / 2; + } else if (frame < 250) { + *w = initial_w / 4; + *h = initial_h / 4; + } + } +} + +class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { + public: + ResizingVideoSource(int width, int height) { + top_width_ = width; + top_height_ = height; + SetSize(top_width_, top_height_); + limit_ = 300; + } + int external_resize_pattern_ = 1; + int force_zero_source_ = 0; + int top_width_; + int top_height_; + ~ResizingVideoSource() override = default; + + protected: + void Next() override { + ++frame_; + unsigned int width = 0; + unsigned int height = 0; + libvpx_test::ACMRandom rnd(libvpx_test::ACMRandom::DeterministicSeed()); + ScaleForFrameNumber(frame_, top_width_, top_height_, &width, &height, + external_resize_pattern_); + SetSize(width, height); + FillFrame(); + unsigned char *image = img_->planes[0]; + for (size_t i = 0; i < raw_sz_; ++i) { + image[i] = rnd.Rand8(); + if (force_zero_source_ && frame_ % 20 == 0) image[i] = 0; + } + } +}; + +// Params: speed setting. +class DatarateOnePassCbrSvcSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcSingleBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for 4:4:4 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 352; + top_sl_height_ = 288; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers, for 4:2:2 Profile 1. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) { + SetSvcConfig(2, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20); + cfg_.g_profile = 1; + cfg_.g_bit_depth = VPX_BITS_8; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_HIGHBITDEPTH +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 10bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_10; + cfg_.g_input_bit_depth = VPX_BITS_10; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers, for Profle 2 12bit. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) { + SetSvcConfig(3, 3); + ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20); + cfg_.g_profile = 2; + cfg_.g_bit_depth = VPX_BITS_12; + cfg_.g_input_bit_depth = VPX_BITS_12; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + + top_sl_width_ = 160; + top_sl_height_ = 90; + cfg_.rc_target_bitrate = 500; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(marpan/jianj): Comment out the rate-target checking for now + // as superframe parsing to get frame size needs to be fixed for + // high bitdepth. + /* + // Use large under/over shoot thresholds as this is a very short clip, + // so not good for testing rate-targeting. + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5, + 1.7); + */ +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with force key frame after frame drop +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generate via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + // Change SVC pattern on the fly. + update_pattern_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on (on at frame = 100). Key frame period is set to +// 1000 so denoise is enabled on non-key. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnFixedLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 1000; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on, for dynamic layers. Start at 2 spatial layers +// and enable 3rd spatial layer at frame = 100. Use periodic key frame with +// period 100 so enabling of spatial layer occurs at key frame. Enable denoiser +// at frame > 100, after the key frame sync. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnEnableLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 100; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on +// the fly switching to 1 and then 2 and back to 3 spatial layers. This switch +// is done by setting spatial layer bitrates to 0, and then back to non-zero, +// during the sequence. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 0; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 2 spatial layers and on +// the fly switching to 1 spatial layer with dynamic resize enabled. +// The resizer will resize the single layer down and back up again, as the +// bitrate goes back up. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL_SingleLayerResize) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 0; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_resize_allowed = 1; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 15, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = true; + single_layer_resize_ = true; + base_speed_setting_ = speed_setting_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Expect at least one resize down and at least one resize back up. + EXPECT_GE(num_resize_down_, 1); + EXPECT_GE(num_resize_up_, 1); + // Don't check rate targeting on two top spatial layer since they will be + // skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// For 1 pass CBR SVC with 1 spatial and 2 temporal layers with dynamic resize +// and denoiser enabled. The resizer will resize the single layer down and back +// up again, as the bitrate goes back up. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc1SL2TL_DenoiseResize) { + SetSvcConfig(1, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_resize_allowed = 1; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 12, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = true; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Expect at least one resize down and at least one resize back up. + EXPECT_GE(num_resize_down_, 1); + EXPECT_GE(num_resize_up_, 1); +} + +// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial +// downscale 5x5. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 3; + cfg_.temporal_layering_mode = 0; + svc_params_.scaling_factor_num[0] = 256; + svc_params_.scaling_factor_den[0] = 1280; + svc_params_.scaling_factor_num[1] = 1280; + svc_params_.scaling_factor_den[1] = 1280; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 999999; + cfg_.kf_min_dist = 0; + cfg_.ss_target_bitrate[0] = 300; + cfg_.ss_target_bitrate[1] = 1400; + cfg_.layer_target_bitrate[0] = 300; + cfg_.layer_target_bitrate[1] = 1400; + cfg_.rc_target_bitrate = 1700; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize +// and denoiser enabled. The external resizer will resize down and back up, +// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable +// layers. Resizing starts on first frame and the pattern is: +// 1/4 -> 1/2 -> 1 -> 1/4 -> 1/2. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern1) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 40; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.rc_dropframe_thresh = 1; + cfg_.kf_max_dist = 10000; + cfg_.kf_min_dist = 10000; + cfg_.rc_resize_allowed = 0; + cfg_.g_w = 1280; + cfg_.g_h = 720; + top_sl_width_ = 1280; + top_sl_height_ = 720; + ResizingVideoSource video(1280, 720); + video.external_resize_pattern_ = 1; + video.force_zero_source_ = 0; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + external_resize_dynamic_drop_layer_ = true; + external_resize_pattern_ = video.external_resize_pattern_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize +// and denoiser enabled. The external resizer will resize down and back up, +// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable +// layers. Resizing starts on first frame and the pattern is: +// 1/2 -> 1/4 -> 1 -> 1/2 -> 1/4. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern2) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 40; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.rc_dropframe_thresh = 1; + cfg_.kf_max_dist = 10000; + cfg_.kf_min_dist = 10000; + cfg_.rc_resize_allowed = 0; + cfg_.g_w = 1280; + cfg_.g_h = 720; + top_sl_width_ = 1280; + top_sl_height_ = 720; + ResizingVideoSource video(1280, 720); + video.external_resize_pattern_ = 2; + video.force_zero_source_ = 0; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + external_resize_dynamic_drop_layer_ = true; + external_resize_pattern_ = video.external_resize_pattern_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize +// and denoiser enabled. The external resizer will resize down and back up, +// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable +// layers. Resizing starts on first frame and the pattern is: +// 1/2 -> 1/4 -> 1 -> 1/2 -> 1/4. This test uses 4 threads with small keyframe +// spacing, and top resolution is 1280x960. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern2Key4Threads) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 40; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.temporal_layering_mode = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.rc_dropframe_thresh = 1; + cfg_.kf_max_dist = 40; + cfg_.kf_min_dist = 40; + cfg_.rc_resize_allowed = 0; + cfg_.g_w = 1280; + cfg_.g_h = 960; + top_sl_width_ = cfg_.g_w; + top_sl_height_ = cfg_.g_h; + ResizingVideoSource video(1280, 960); + video.external_resize_pattern_ = 2; + video.force_zero_source_ = 0; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + external_resize_dynamic_drop_layer_ = true; + external_resize_pattern_ = video.external_resize_pattern_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// For 1 pass CBR SVC with 3 spatial and 3 temporal layers with external resize +// and denoiser enabled. The external resizer will resize down and back up, +// setting 0/nonzero bitrate on spatial enhancement layers to disable/enable +// layers. Resizing starts on first frame and the pattern is: +// 1/4 -> 1/2 -> 1 -> 1/4 -> 1/2. The source will be set to 0 every x frames, +// otherwise random values, to trigger scene detection in the encoder. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiseExternalResizePattern1SceneChange) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 40; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.rc_dropframe_thresh = 1; + cfg_.kf_max_dist = 10000; + cfg_.kf_min_dist = 10000; + cfg_.rc_resize_allowed = 0; + cfg_.g_w = 1280; + cfg_.g_h = 720; + top_sl_width_ = 1280; + top_sl_height_ = 720; + ResizingVideoSource video(1280, 720); + video.external_resize_pattern_ = 1; + video.force_zero_source_ = 1; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + dynamic_drop_layer_ = false; + single_layer_resize_ = false; + denoiser_on_ = 1; + base_speed_setting_ = speed_setting_; + external_resize_dynamic_drop_layer_ = true; + external_resize_pattern_ = video.external_resize_pattern_; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// Params: speed setting and index for bitrate array. +class DatarateOnePassCbrSvcMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcMultiBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run CIF clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and +// 3 temporal layers. Run VGA clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_VBR; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.3); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting, layer framedrop control and index for bitrate array. +class DatarateOnePassCbrSvcFrameDropMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith3Params { + public: + DatarateOnePassCbrSvcFrameDropMultiBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64, + 1.45); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads, for 1284x770, which +// likely is the issue for Bug: 366146260. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, + OnePassCbrSvc3SL3TL4Threads1284x770) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1284_770_30.y4m", 0, 60); + top_sl_width_ = 1284; + top_sl_height_ = 770; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads, for 1857x167. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, + OnePassCbrSvc3SL3TL4Threads1857x167) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1857_167_30.y4m", 0, 60); + top_sl_width_ = 1857; + top_sl_height_ = 167; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, for KSVC in flexible mode with no update of reference +// frames for all spatial layers on TL > 0 superframes. +// Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL2TL4ThKSVCFlex) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ksvc_flex_noupd_tlenh_ = true; + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +} + +// Params: speed setting, inter-layer prediction mode. +class DatarateOnePassCbrSvcInterLayerPredSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcInterLayerPredSingleBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(2); + ResetModel(); + } +}; + +// Check basic rate targeting with different inter-layer prediction modes for 1 +// pass CBR SVC: 3 spatial layers and 3 temporal layers. Run CIF clip with 1 +// thread. +TEST_P(DatarateOnePassCbrSvcInterLayerPredSingleBR, OnePassCbrSvc3SL3TL) { + // Disable test for inter-layer pred off for now since simulcast_mode fails. + if (inter_layer_pred_mode_ == INTER_LAYER_PRED_OFF) return; + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check rate targeting with different inter-layer prediction modes for 1 pass +// CBR SVC: 3 spatial layers and 3 temporal layers, changing the target bitrate +// at the middle of encoding. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLDynamicBitrateChange) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + change_bitrate_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting, noise sensitivity, index for bitrate array and inter +// layer pred mode. +class DatarateOnePassCbrSvcDenoiser + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith4Params { + public: + DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcDenoiser() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(3); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC with denoising. +// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads. +TEST_P(DatarateOnePassCbrSvcDenoiser, OnePassCbrSvc2SL3TLDenoiserOn) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 600, 800, 1000 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + // For SVC, noise_sen = 1 means denoising only the top spatial layer + // noise_sen = 2 means denoising the two top spatial layers. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + denoiser_on_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Params: speed setting, key frame dist. +class DatarateOnePassCbrSvcSmallKF + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcSmallKF() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2); + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2) + 32; + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run VGA clip with 1 thread, and place layer sync frames: +// one at middle layer first, then another one for top layer, and another +// insert for base spatial layer (which forces key frame). +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLSyncFrames) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 3 spatial layers, 1 temporal layer, with +// intra-only frame as sync frame on base spatial layer. +// Intra_only is inserted at start and in middle of sequence. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL1TLSyncWithIntraOnly) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + // Use intra_only frame for sync on base layer. + force_intra_only_frame_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 2 quality layers (same resolution different, +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + tune_content_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal. + cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting. +class DatarateOnePassCbrSvcPostencodeDrop + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Run SVC encoder for 2 quality layers (same resolution different, +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 200; + cfg_.rc_buf_optimal_sz = 200; + cfg_.rc_buf_sz = 400; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + top_sl_width_ = 352; + top_sl_height_ = 288; + ResetModel(); + base_speed_setting_ = speed_setting_; + tune_content_ = 1; + use_post_encode_drop_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal. + cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSingleBR, + ::testing::Range(5, 10)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcPostencodeDrop, + ::testing::Range(5, 6)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcInterLayerPredSingleBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcFrameDropMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 2), + ::testing::Range(0, 3)); + +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3), ::testing::Range(0, 4)); +#endif + +VP9_INSTANTIATE_TEST_SUITE(DatarateOnePassCbrSvcSmallKF, + ::testing::Range(5, 10), ::testing::Range(32, 36)); +} // namespace +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_end_to_end_test.cc b/media/libvpx/libvpx/test/svc_end_to_end_test.cc new file mode 100644 index 0000000000..3bd6b1307c --- /dev/null +++ b/media/libvpx/libvpx/test/svc_end_to_end_test.cc @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +enum INTER_LAYER_PRED { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +}; + +class ScalePartitionOnePassCbrSvc + : public OnePassCbrSvc, + public ::testing::TestWithParam { + public: + ScalePartitionOnePassCbrSvc() + : OnePassCbrSvc(GetParam()), mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + ~ScalePartitionOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + ++mismatch_nframes_; + } + + void SetConfig(const int /*num_temporal_layer*/) override {} + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + private: + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; +}; + +TEST_P(ScalePartitionOnePassCbrSvc, OnePassCbrSvc3SL3TL1080P) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video( + "slides_code_term_web_plot.1920_1080.yuv", 1920, 1080, 30, 1, 0, 100); + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: Inter layer prediction modes. +class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + SyncFrameOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), current_video_frame_(0), + frame_to_start_decode_(0), frame_to_sync_(0), + inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1), + decode_to_layer_after_sync_(-1), denoiser_on_(0), + intra_only_test_(false), loopfilter_off_(0), mismatch_nframes_(0), + num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_)); + } + + protected: + ~SyncFrameOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + bool DoDecode() const override { + return current_video_frame_ >= frame_to_start_decode_; + } + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode. + void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + (sl == 0) ? 0 : num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. + if (!tl) { + if (!sl) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + // Non-zero spatiall layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + // Non-zero spatial. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't + // update any reference buffers). + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } + } + } + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + current_video_frame_ = video->frame(); + PreEncodeFrameHookSetup(video, encoder); + if (video->frame() == 0) { + // Do not turn off inter-layer pred completely because simulcast mode + // fails. + if (inter_layer_pred_mode_ != INTER_LAYER_PRED_OFF) + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + if (intra_only_test_) + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tess to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } + if (flexible_mode_) { + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) { + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + ref_frame_config_.duration[i] = 1; + } + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, + &ref_frame_config_); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); + } + if (video->frame() == frame_to_sync_) { + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); + } + } + +#if CONFIG_VP9_DECODER + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { + if (video->frame() < frame_to_sync_) { + if (decode_to_layer_before_sync_ >= 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, + decode_to_layer_before_sync_); + } else { + if (decode_to_layer_after_sync_ >= 0) { + int decode_to_layer = decode_to_layer_after_sync_; + // Overlay frame is additional layer for intra-only. + if (video->frame() == frame_to_sync_ && intra_only_test_ && + decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1) + decode_to_layer += 1; + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer); + } + } + } +#endif + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1] && + current_video_frame_ >= frame_to_sync_) + num_nonref_frames_++; + + if (intra_only_test_ && current_video_frame_ == frame_to_sync_) { + // Intra-only frame is only generated for spatial layers > 1 and <= 3, + // among other conditions (see constraint in set_intra_only_frame(). If + // intra-only is no allowed then encoder will insert key frame instead. + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (number_spatial_layers_ == 1 || number_spatial_layers_ > 3) + ASSERT_TRUE(key_frame); + else + ASSERT_FALSE(key_frame); + } + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + unsigned int current_video_frame_; + unsigned int frame_to_start_decode_; + unsigned int frame_to_sync_; + int inter_layer_pred_mode_; + int decode_to_layer_before_sync_; + int decode_to_layer_after_sync_; + int denoiser_on_; + bool intra_only_test_; + int loopfilter_off_; + vpx_svc_spatial_layer_sync_t svc_layer_sync_; + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; + bool flexible_mode_; + vpx_svc_ref_frame_config_t ref_frame_config_; + + private: + void SetConfig(const int num_temporal_layer) override { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } +}; + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Only start decoding on the sync layer. +// Full sync: insert key frame on base layer. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) { + SetSvcConfig(3, 3); + // Sync is on base layer so the frame to sync and the frame to start decoding + // is the same. + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = -1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA and VGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + denoiser_on_ = 1; + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Encode 3 spatial, 2 temporal layer in flexible mode but don't +// start decoding. During the sequence insert intra-only on base/qvga +// layer at frame 20 and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) { + SetSvcConfig(3, 2); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = true; + AssignLayerBitrates(); + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + +// Encode 3 spatial, 3 temporal layer but don't start decoding. +// During the sequence insert intra-only on base/qvga layer at frame 20 +// and start decoding only QVGA layer from there. +TEST_P(SyncFrameOnePassCbrSvc, + OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Can't check mismatch here because only base is decoded at + // frame sync, whereas encoder continues encoding all layers. +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer and sync_layer on middle/VGA layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have +1 frames. Thus set + // the layer to decode after sync frame to +1 from + // decode_to_layer_before_sync. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from sync frame, insert intra-only on base/qvga layer. Decode +// all layers. For 1 spatial layer, it inserts a key frame. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(1, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + flexible_mode_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: Loopfilter modes. +class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam { + public: + LoopfilterOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), loopfilter_off_(GET_PARAM(1)), + mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + ~LoopfilterOnePassCbrSvc() override = default; + + void SetUp() override { + InitializeConfig(); + speed_setting_ = 7; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + PreEncodeFrameHookSetup(video, encoder); + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + // Consider 3 cases: + if (loopfilter_off_ == 0) { + // loopfilter is on for all spatial layers on every superrframe. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 0; + } + } else if (loopfilter_off_ == 1) { + // loopfilter is off for non-reference frames for all spatial layers. + for (int i = 0; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } else { + // loopfilter is off for all SL0 frames, and off only for non-reference + // frames for SL > 0. + svc_params_.loopfilter_ctrl[0] = 2; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.loopfilter_ctrl[i] = 1; + } + } + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 1) { + // For non-SVC mode use the single layer control. + encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_); + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + ++mismatch_nframes_; + } + + void SetConfig(const int /*num_temporal_layer*/) override {} + + int GetMismatchFrames() const { return mismatch_nframes_; } + int GetNonRefFrames() const { return num_nonref_frames_; } + + int loopfilter_off_; + + private: + int mismatch_nframes_; + int num_nonref_frames_; +}; + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL1TLLoopfilterOff) { + SetSvcConfig(1, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc1SL3TLLoopfilterOff) { + SetSvcConfig(1, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +TEST_P(LoopfilterOnePassCbrSvc, OnePassCbrSvc3SL3TLLoopfilterOff) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + if (loopfilter_off_ == 0) + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); + else + EXPECT_EQ(GetMismatchFrames(), 0); +#endif +} + +VP9_INSTANTIATE_TEST_SUITE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_SUITE(LoopfilterOnePassCbrSvc, ::testing::Range(0, 3)); + +INSTANTIATE_TEST_SUITE_P( + VP9, ScalePartitionOnePassCbrSvc, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); + +} // namespace +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_test.cc b/media/libvpx/libvpx/test/svc_test.cc index 482d9fffa1..cbc0abe032 100644 --- a/media/libvpx/libvpx/test/svc_test.cc +++ b/media/libvpx/libvpx/test/svc_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,782 +8,128 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/codec_factory.h" -#include "test/decode_test_driver.h" -#include "test/i420_video_source.h" +#include "test/svc_test.h" -#include "vp9/decoder/vp9_decoder.h" - -#include "vpx/svc_context.h" -#include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" - -namespace { - -using libvpx_test::CodecFactory; -using libvpx_test::Decoder; -using libvpx_test::DxDataIterator; -using libvpx_test::VP9CodecFactory; - -class SvcTest : public ::testing::Test { - protected: - static const uint32_t kWidth = 352; - static const uint32_t kHeight = 288; - - SvcTest() - : codec_iface_(0), test_file_name_("hantro_collage_w352h288.yuv"), - codec_initialized_(false), decoder_(0) { - memset(&svc_, 0, sizeof(svc_)); - memset(&codec_, 0, sizeof(codec_)); - memset(&codec_enc_, 0, sizeof(codec_enc_)); +namespace svc_test { +void OnePassCbrSvc::SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer) { + SetConfig(num_temporal_layer); + cfg_.ss_number_layers = num_spatial_layer; + cfg_.ts_number_layers = num_temporal_layer; + if (num_spatial_layer == 1) { + svc_params_.scaling_factor_num[0] = 288; + svc_params_.scaling_factor_den[0] = 288; + } else if (num_spatial_layer == 2) { + svc_params_.scaling_factor_num[0] = 144; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 288; + svc_params_.scaling_factor_den[1] = 288; + } else if (num_spatial_layer == 3) { + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; } + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; +} - virtual ~SvcTest() {} - - virtual void SetUp() { - svc_.log_level = SVC_LOG_DEBUG; - svc_.log_print = 0; - - codec_iface_ = vpx_codec_vp9_cx(); - const vpx_codec_err_t res = - vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0); - EXPECT_EQ(VPX_CODEC_OK, res); - - codec_enc_.g_w = kWidth; - codec_enc_.g_h = kHeight; - codec_enc_.g_timebase.num = 1; - codec_enc_.g_timebase.den = 60; - codec_enc_.kf_min_dist = 100; - codec_enc_.kf_max_dist = 100; - - vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); - VP9CodecFactory codec_factory; - decoder_ = codec_factory.CreateDecoder(dec_cfg, 0); - - tile_columns_ = 0; - tile_rows_ = 0; - } - - virtual void TearDown() { - ReleaseEncoder(); - delete (decoder_); - } - - void InitializeEncoder() { - const vpx_codec_err_t res = - vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_OK, res); - vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4); // Make the test faster - vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_); - vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_); - codec_initialized_ = true; - } - - void ReleaseEncoder() { - vpx_svc_release(&svc_); - if (codec_initialized_) vpx_codec_destroy(&codec_); - codec_initialized_ = false; - } - - void GetStatsData(std::string *const stats_buf) { - vpx_codec_iter_t iter = NULL; - const vpx_codec_cx_pkt_t *cx_pkt; - - while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) { - if (cx_pkt->kind == VPX_CODEC_STATS_PKT) { - EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U); - ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL); - stats_buf->append(static_cast(cx_pkt->data.twopass_stats.buf), - cx_pkt->data.twopass_stats.sz); +void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 63; + svc_params_.min_quantizers[i] = 0; + } + if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { + svc_params_.speed_per_layer[0] = base_speed_setting_; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.speed_per_layer[i] = speed_setting_; } + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + } + encoder->Control(VP8E_SET_CPUUSED, speed_setting_); + encoder->Control(VP9E_SET_AQ_MODE, 3); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); + } + + superframe_count_++; + temporal_layer_id_ = 0; + if (number_temporal_layers_ == 2) { + temporal_layer_id_ = (superframe_count_ % 2 != 0); + } else if (number_temporal_layers_ == 3) { + if (superframe_count_ % 2 != 0) temporal_layer_id_ = 2; + if (superframe_count_ > 1) { + if ((superframe_count_ - 2) % 4 == 0) temporal_layer_id_ = 1; } } - void Pass1EncodeNFrames(const int n, const int layers, - std::string *const stats_buf) { - vpx_codec_err_t res; + frame_flags_ = 0; +} - ASSERT_GT(n, 0); - ASSERT_GT(layers, 0); - svc_.spatial_layers = layers; - codec_enc_.g_pass = VPX_RC_FIRST_PASS; - InitializeEncoder(); - - libvpx_test::I420VideoSource video( - test_file_name_, codec_enc_.g_w, codec_enc_.g_h, - codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30); - video.Begin(); - - for (int i = 0; i < n; ++i) { - res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(), - video.duration(), VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - GetStatsData(stats_buf); - video.Next(); - } - - // Flush encoder and test EOS packet. - res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(), - VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - GetStatsData(stats_buf); - - ReleaseEncoder(); - } - - void StoreFrames(const size_t max_frame_received, - struct vpx_fixed_buf *const outputs, - size_t *const frame_received) { - vpx_codec_iter_t iter = NULL; - const vpx_codec_cx_pkt_t *cx_pkt; - - while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) { - if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) { - const size_t frame_size = cx_pkt->data.frame.sz; - - EXPECT_GT(frame_size, 0U); - ASSERT_TRUE(cx_pkt->data.frame.buf != NULL); - ASSERT_LT(*frame_received, max_frame_received); - - if (*frame_received == 0) - EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)); - - outputs[*frame_received].buf = malloc(frame_size + 16); - ASSERT_TRUE(outputs[*frame_received].buf != NULL); - memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf, - frame_size); - outputs[*frame_received].sz = frame_size; - ++(*frame_received); - } +void OnePassCbrSvc::PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast(layer_target_avg_bandwidth_[layer]); } } +} - void Pass2EncodeNFrames(std::string *const stats_buf, const int n, - const int layers, - struct vpx_fixed_buf *const outputs) { - vpx_codec_err_t res; - size_t frame_received = 0; - - ASSERT_TRUE(outputs != NULL); - ASSERT_GT(n, 0); - ASSERT_GT(layers, 0); - svc_.spatial_layers = layers; - codec_enc_.rc_target_bitrate = 500; - if (codec_enc_.g_pass == VPX_RC_LAST_PASS) { - ASSERT_TRUE(stats_buf != NULL); - ASSERT_GT(stats_buf->size(), 0U); - codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0]; - codec_enc_.rc_twopass_stats_in.sz = stats_buf->size(); - } - InitializeEncoder(); - - libvpx_test::I420VideoSource video( - test_file_name_, codec_enc_.g_w, codec_enc_.g_h, - codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30); - video.Begin(); - - for (int i = 0; i < n; ++i) { - res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(), - video.duration(), VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - StoreFrames(n, outputs, &frame_received); - video.Next(); - } - - // Flush encoder. - res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(), - VPX_DL_GOOD_QUALITY); - EXPECT_EQ(VPX_CODEC_OK, res); - StoreFrames(n, outputs, &frame_received); - - EXPECT_EQ(frame_received, static_cast(n)); - - ReleaseEncoder(); - } - - void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) { - int decoded_frames = 0; - int received_frames = 0; - - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(n, 0); - - for (int i = 0; i < n; ++i) { - ASSERT_TRUE(inputs[i].buf != NULL); - ASSERT_GT(inputs[i].sz, 0U); - const vpx_codec_err_t res_dec = decoder_->DecodeFrame( - static_cast(inputs[i].buf), inputs[i].sz); - ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError(); - ++decoded_frames; - - DxDataIterator dec_iter = decoder_->GetDxData(); - while (dec_iter.Next() != NULL) { - ++received_frames; - } - } - EXPECT_EQ(decoded_frames, n); - EXPECT_EQ(received_frames, n); - } - - void DropEnhancementLayers(struct vpx_fixed_buf *const inputs, - const int num_super_frames, - const int remained_spatial_layers) { - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(num_super_frames, 0); - ASSERT_GT(remained_spatial_layers, 0); - - for (int i = 0; i < num_super_frames; ++i) { - uint32_t frame_sizes[8] = { 0 }; - int frame_count = 0; - int frames_found = 0; - int frame; - ASSERT_TRUE(inputs[i].buf != NULL); - ASSERT_GT(inputs[i].sz, 0U); - - vpx_codec_err_t res = vp9_parse_superframe_index( - static_cast(inputs[i].buf), inputs[i].sz, - frame_sizes, &frame_count, NULL, NULL); - ASSERT_EQ(VPX_CODEC_OK, res); - - if (frame_count == 0) { - // There's no super frame but only a single frame. - ASSERT_EQ(1, remained_spatial_layers); - } else { - // Found a super frame. - uint8_t *frame_data = static_cast(inputs[i].buf); - uint8_t *frame_start = frame_data; - for (frame = 0; frame < frame_count; ++frame) { - // Looking for a visible frame. - if (frame_data[0] & 0x02) { - ++frames_found; - if (frames_found == remained_spatial_layers) break; - } - frame_data += frame_sizes[frame]; - } - ASSERT_LT(frame, frame_count) - << "Couldn't find a visible frame. " - << "remained_spatial_layers: " << remained_spatial_layers - << " super_frame: " << i; - if (frame == frame_count - 1) continue; - - frame_data += frame_sizes[frame]; - - // We need to add one more frame for multiple frame contexts. - uint8_t marker = - static_cast(inputs[i].buf)[inputs[i].sz - 1]; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * frame_count; - const size_t new_index_sz = 2 + mag * (frame + 1); - marker &= 0x0f8; - marker |= frame; - - // Copy existing frame sizes. - memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1, - new_index_sz - 2); - // New marker. - frame_data[0] = marker; - frame_data += (mag * (frame + 1) + 1); - - *frame_data++ = marker; - inputs[i].sz = frame_data - frame_start; - } +void OnePassCbrSvc::AssignLayerBitrates() { + int sl, spatial_layer_target; + int spatial_layers = cfg_.ss_number_layers; + int temporal_layers = cfg_.ts_number_layers; + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; + for (sl = 0; sl < spatial_layers; ++sl) { + if (svc_params_.scaling_factor_den[sl] > 0) { + alloc_ratio[sl] = + static_cast((svc_params_.scaling_factor_num[sl] * 1.0 / + svc_params_.scaling_factor_den[sl])); + total += alloc_ratio[sl]; } } - - void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) { - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(n, 0); - - for (int i = 0; i < n; ++i) { - free(inputs[i].buf); - inputs[i].buf = NULL; - inputs[i].sz = 0; + for (sl = 0; sl < spatial_layers; ++sl) { + cfg_.ss_target_bitrate[sl] = spatial_layer_target = + static_cast(cfg_.rc_target_bitrate * alloc_ratio[sl] / + total); + const int index = sl * temporal_layers; + if (cfg_.temporal_layering_mode == 3) { + cfg_.layer_target_bitrate[index] = spatial_layer_target >> 1; + cfg_.layer_target_bitrate[index + 1] = + (spatial_layer_target >> 1) + (spatial_layer_target >> 2); + cfg_.layer_target_bitrate[index + 2] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode == 2) { + cfg_.layer_target_bitrate[index] = spatial_layer_target * 2 / 3; + cfg_.layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode <= 1) { + cfg_.layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth_[layer] = static_cast( + cfg_.layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model_[layer] = + cfg_.layer_target_bitrate[layer] * cfg_.rc_buf_initial_sz; } } - - SvcContext svc_; - vpx_codec_ctx_t codec_; - struct vpx_codec_enc_cfg codec_enc_; - vpx_codec_iface_t *codec_iface_; - std::string test_file_name_; - bool codec_initialized_; - Decoder *decoder_; - int tile_columns_; - int tile_rows_; -}; - -TEST_F(SvcTest, SvcInit) { - // test missing parameters - vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 6; // too many layers - res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 0; // use default layers - InitializeEncoder(); - EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers); } - -TEST_F(SvcTest, InitTwoLayers) { - svc_.spatial_layers = 2; - InitializeEncoder(); -} - -TEST_F(SvcTest, InvalidOptions) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "not-an-option=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); -} - -TEST_F(SvcTest, SetLayersOption) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(3, svc_.spatial_layers); -} - -TEST_F(SvcTest, SetMultipleOptions) { - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(2, svc_.spatial_layers); -} - -TEST_F(SvcTest, SetScaleFactorsOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetQuantizersOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30"); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetAutoAltRefOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - InitializeEncoder(); -} - -// Test that decoder can handle an SVC frame as the first frame in a sequence. -TEST_F(SvcTest, OnePassEncodeOneFrame) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - vpx_fixed_buf output = vpx_fixed_buf(); - Pass2EncodeNFrames(NULL, 1, 2, &output); - DecodeNFrames(&output, 1); - FreeBitstreamBuffers(&output, 1); -} - -TEST_F(SvcTest, OnePassEncodeThreeFrames) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - codec_enc_.g_lag_in_frames = 0; - vpx_fixed_buf outputs[3]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 3); - FreeBitstreamBuffers(&outputs[0], 3); -} - -TEST_F(SvcTest, TwoPassEncode10Frames) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 5, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 4); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 3); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(20, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 2); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 1); - DecodeNFrames(&outputs[0], 20); - - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, SetMultipleFrameContextsOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 2; - res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - InitializeEncoder(); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1 scale-factors=1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(10, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -} // namespace +} // namespace svc_test diff --git a/media/libvpx/libvpx/test/svc_test.h b/media/libvpx/libvpx/test/svc_test.h new file mode 100644 index 0000000000..de39412f6d --- /dev/null +++ b/media/libvpx/libvpx/test/svc_test.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_SVC_TEST_H_ +#define VPX_TEST_SVC_TEST_H_ + +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +class OnePassCbrSvc : public ::libvpx_test::EncoderTest { + public: + explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0), + superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0), + number_spatial_layers_(0) { + memset(&svc_params_, 0, sizeof(svc_params_)); + memset(bits_in_buffer_model_, 0, + sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS); + memset(layer_target_avg_bandwidth_, 0, + sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS); + } + + protected: + ~OnePassCbrSvc() override {} + + virtual void SetConfig(const int num_temporal_layer) = 0; + + virtual void SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer); + + virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder); + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; + + virtual void AssignLayerBitrates(); + + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} + + vpx_svc_extra_cfg_t svc_params_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; + int base_speed_setting_; + int speed_setting_; + int superframe_count_; + int temporal_layer_id_; + int number_temporal_layers_; + int number_spatial_layers_; +}; +} // namespace svc_test + +#endif // VPX_TEST_SVC_TEST_H_ diff --git a/media/libvpx/libvpx/test/test-data.mk b/media/libvpx/libvpx/test/test-data.mk index a6cda1e9b0..ffb9c0801e 100644 --- a/media/libvpx/libvpx/test/test-data.mk +++ b/media/libvpx/libvpx/test/test-data.mk @@ -2,15 +2,19 @@ LIBVPX_TEST_SRCS-yes += test-data.mk # Encoder test source LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288_nv12.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444_20f.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m -LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422_20f.y4m +LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444_20f.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m @@ -18,11 +22,19 @@ LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_422.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += repro-oss-fuzz-69906.y4m + +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += 4x2.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_credits.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1284_770_30.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1857_167_30.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += noisy_clip_640_360.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += crowd_run_360p_10_150f.y4m # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf @@ -731,8 +743,16 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5 endif # CONFIG_VP9_HIGHBITDEPTH # Invalid files for testing libvpx error checking. +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-148271109.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm @@ -771,6 +791,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm @@ -778,8 +800,13 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # Encode / Decode test @@ -814,7 +841,6 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv -LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv @@ -874,3 +900,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/media/libvpx/libvpx/test/test-data.sha1 b/media/libvpx/libvpx/test/test-data.sha1 index 22ca6f5643..1e8c090e0c 100644 --- a/media/libvpx/libvpx/test/test-data.sha1 +++ b/media/libvpx/libvpx/test/test-data.sha1 @@ -1,3 +1,4 @@ +3eaf216d9fc8b4b9bb8c3956311f49a85974806c *bus_352x288_420_f20_b8.yuv d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 76024eb753cdac6a5e5703aaea189d35c3c30ac7 *invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf @@ -6,6 +7,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm @@ -15,13 +18,13 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res 9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m -a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m -0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m -ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m +0936b837708ae68c034719f8e07596021c2c214f *park_joy_90p_10_420_20f.y4m +5727a853c083c1099f837d27967bc1322d50ed4f *park_joy_90p_10_422_20f.y4m +e13489470ef8e8b2a871a5640d795a42a39be58d *park_joy_90p_10_444_20f.y4m c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv -614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m -c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m -b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m +79b0dc1784635a7f291e21c4e8d66a29c496ab99 *park_joy_90p_12_420_20f.y4m +9cf22b0f809f7464c8b9058f0cfa9d905921cbd1 *park_joy_90p_12_422_20f.y4m +22b2a4abaecc4a9ade6bb503d25fb82367947e85 *park_joy_90p_12_444_20f.y4m 82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m 4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m @@ -848,3 +851,27 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_ 6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res +894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile +f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf +eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res +c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf +f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res +bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv +094be4b80fa30bd227149ea16ab6476d549ea092 *slides_code_term_web_plot.1920_1080.yuv +518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res +ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv +8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m +f9a73e921552598a5804911e9f84fec2318e056a *repro-oss-fuzz-69906.y4m +320874b648f54e9156339e9d7e322ec4c51cb5f7 *niklas_1284_770_30.y4m +64e876e725f83b93e92a8240457c152dd6f5b77d *niklas_1857_167_30.y4m +27a7f0ebda1842a27c4b973b0e9e393769f74012 *4x2.y4m diff --git a/media/libvpx/libvpx/test/test.mk b/media/libvpx/libvpx/test/test.mk index e25463e46a..7c5cbf0720 100644 --- a/media/libvpx/libvpx/test/test.mk +++ b/media/libvpx/libvpx/test/test.mk @@ -1,9 +1,14 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc +LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += md5_helper.h LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += test_vectors.cc LIBVPX_TEST_SRCS-yes += test_vectors.h @@ -16,12 +21,12 @@ LIBVPX_TEST_SRCS-yes += video_source.h ## Black box tests only use the public API. ## LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c +LIBVPX_TEST_SRCS-yes += vpx_image_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c +ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += datarate_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -30,24 +35,41 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += resize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h +ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +endif +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc +endif LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h @@ -66,12 +88,14 @@ LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/common/webmids.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(LIBWEBM_PARSER_SRCS) LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../tools_common.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc +$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) endif LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc @@ -110,11 +134,13 @@ ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes) LIBVPX_TEST_SRCS-yes += vp8_boolcoder_test.cc LIBVPX_TEST_SRCS-yes += vp8_fragments_test.cc endif - LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += add_noise_test.cc LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_SSSE3) $(HAVE_SSE4_1) $(HAVE_NEON) \ + $(HAVE_MSA) $(HAVE_MMI))) LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc @@ -122,6 +148,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc @@ -141,6 +168,7 @@ LIBVPX_TEST_SRCS-yes += superframe_test.cc LIBVPX_TEST_SRCS-yes += tile_independence_test.cc LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc +LIBVPX_TEST_SRCS-yes += vp9_roi_test.cc endif LIBVPX_TEST_SRCS-yes += convolve_test.cc @@ -149,25 +177,36 @@ LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) -LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc endif +ifeq ($(CONFIG_VP9_ENCODER),yes) +LIBVPX_TEST_SRCS-$(CONFIG_NON_GREEDY_MV) += non_greedy_mv_test.cc +endif + ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes) -LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc +LIBVPX_TEST_SRCS-yes += vp9_denoiser_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc @@ -176,10 +215,23 @@ endif # VP9 ## Multi-codec / unconditional whitebox tests. LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc +ifneq (, $(filter yes, $(HAVE_NEON) $(HAVE_SSE2) $(HAVE_MSA))) LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc +endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h + +RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc +RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc +RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h +RC_INTERFACE_TEST_SRCS-yes += codec_factory.h endif # CONFIG_SHARED diff --git a/media/libvpx/libvpx/test/test_intra_pred_speed.cc b/media/libvpx/libvpx/test/test_intra_pred_speed.cc index 17dde1a526..87ab732306 100644 --- a/media/libvpx/libvpx/test/test_intra_pred_speed.cc +++ b/media/libvpx/libvpx/test/test_intra_pred_speed.cc @@ -12,11 +12,13 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" +#include "test/init_vpx_test.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -26,8 +28,8 @@ namespace { -typedef void (*VpxPredFunc)(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left); +using VpxPredFunc = void (*)(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left); const int kBPS = 32; const int kTotalPixels = 32 * kBPS; @@ -48,11 +50,9 @@ struct IntraPredTestMem { for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask; for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask; - // some code assumes the top row has been extended: - // d45/d63 C-code, for instance, but not the assembly. - // TODO(jzern): this style of extension isn't strictly necessary. + // d45/d63 require the top row to be extended. ASSERT_LE(block_size, kBPS); - for (int i = block_size; i < 2 * kBPS; ++i) { + for (int i = block_size; i < 2 * block_size; ++i) { above[i] = above[block_size - 1]; } } @@ -63,7 +63,7 @@ struct IntraPredTestMem { DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]); }; -typedef IntraPredTestMem Vp9IntraPredTestMem; +using Vp9IntraPredTestMem = IntraPredTestMem; void CheckMd5Signature(const char name[], const char *const signatures[], const void *data, size_t data_size, int elapsed_time, @@ -85,7 +85,7 @@ void TestIntraPred(const char name[], VpxPredFunc const *pred_funcs, intra_pred_test_mem.Init(block_size, 8); for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { - if (pred_funcs[k] == NULL) continue; + if (pred_funcs[k] == nullptr) continue; memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; @@ -206,58 +206,64 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2, vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2, vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2, - vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, NULL, - NULL, NULL, vpx_d207_predictor_4x4_sse2, NULL, + vpx_h_predictor_4x4_sse2, vpx_d45_predictor_4x4_sse2, nullptr, + nullptr, nullptr, vpx_d207_predictor_4x4_sse2, nullptr, vpx_tm_predictor_4x4_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred8, vpx_dc_predictor_8x8_sse2, vpx_dc_left_predictor_8x8_sse2, vpx_dc_top_predictor_8x8_sse2, vpx_dc_128_predictor_8x8_sse2, vpx_v_predictor_8x8_sse2, - vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, NULL, - NULL, NULL, NULL, NULL, vpx_tm_predictor_8x8_sse2) + vpx_h_predictor_8x8_sse2, vpx_d45_predictor_8x8_sse2, nullptr, + nullptr, nullptr, nullptr, nullptr, vpx_tm_predictor_8x8_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred16, vpx_dc_predictor_16x16_sse2, vpx_dc_left_predictor_16x16_sse2, vpx_dc_top_predictor_16x16_sse2, vpx_dc_128_predictor_16x16_sse2, vpx_v_predictor_16x16_sse2, - vpx_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_sse2) + vpx_h_predictor_16x16_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_sse2) INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, vpx_dc_left_predictor_32x32_sse2, vpx_dc_top_predictor_32x32_sse2, vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2, - vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_sse2) + vpx_h_predictor_32x32_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 #if HAVE_SSSE3 -INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_4x4_ssse3, NULL, - vpx_d63_predictor_4x4_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_d153_predictor_8x8_ssse3, - vpx_d207_predictor_8x8_ssse3, vpx_d63_predictor_8x8_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_16x16_ssse3, NULL, NULL, - vpx_d153_predictor_16x16_ssse3, vpx_d207_predictor_16x16_ssse3, - vpx_d63_predictor_16x16_ssse3, NULL) -INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_d45_predictor_32x32_ssse3, NULL, NULL, - vpx_d153_predictor_32x32_ssse3, vpx_d207_predictor_32x32_ssse3, - vpx_d63_predictor_32x32_ssse3, NULL) +INTRA_PRED_TEST(SSSE3, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_4x4_ssse3, nullptr, + vpx_d63_predictor_4x4_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred8, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_d153_predictor_8x8_ssse3, vpx_d207_predictor_8x8_ssse3, + vpx_d63_predictor_8x8_ssse3, nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred16, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_16x16_ssse3, nullptr, + nullptr, vpx_d153_predictor_16x16_ssse3, + vpx_d207_predictor_16x16_ssse3, vpx_d63_predictor_16x16_ssse3, + nullptr) +INTRA_PRED_TEST(SSSE3, TestIntraPred32, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_d45_predictor_32x32_ssse3, nullptr, + nullptr, vpx_d153_predictor_32x32_ssse3, + vpx_d207_predictor_32x32_ssse3, vpx_d63_predictor_32x32_ssse3, + nullptr) #endif // HAVE_SSSE3 #if HAVE_DSPR2 -INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_4x4_dspr2) -INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, NULL, NULL, - NULL, NULL, vpx_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_tm_predictor_8x8_c) -INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, NULL, - NULL, NULL, NULL, vpx_h_predictor_16x16_dspr2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(DSPR2, TestIntraPred4, vpx_dc_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_4x4_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_4x4_dspr2) +INTRA_PRED_TEST(DSPR2, TestIntraPred8, vpx_dc_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_8x8_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, + vpx_tm_predictor_8x8_c) +INTRA_PRED_TEST(DSPR2, TestIntraPred16, vpx_dc_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, vpx_h_predictor_16x16_dspr2, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr) #endif // HAVE_DSPR2 #if HAVE_NEON @@ -265,63 +271,104 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon, + vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA INTRA_PRED_TEST(MSA, TestIntraPred4, vpx_dc_predictor_4x4_msa, vpx_dc_left_predictor_4x4_msa, vpx_dc_top_predictor_4x4_msa, vpx_dc_128_predictor_4x4_msa, vpx_v_predictor_4x4_msa, - vpx_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_4x4_msa) + vpx_h_predictor_4x4_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_4x4_msa) INTRA_PRED_TEST(MSA, TestIntraPred8, vpx_dc_predictor_8x8_msa, vpx_dc_left_predictor_8x8_msa, vpx_dc_top_predictor_8x8_msa, vpx_dc_128_predictor_8x8_msa, vpx_v_predictor_8x8_msa, - vpx_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_8x8_msa) + vpx_h_predictor_8x8_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_8x8_msa) INTRA_PRED_TEST(MSA, TestIntraPred16, vpx_dc_predictor_16x16_msa, vpx_dc_left_predictor_16x16_msa, vpx_dc_top_predictor_16x16_msa, vpx_dc_128_predictor_16x16_msa, vpx_v_predictor_16x16_msa, - vpx_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_16x16_msa) + vpx_h_predictor_16x16_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_16x16_msa) INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa, vpx_dc_left_predictor_32x32_msa, vpx_dc_top_predictor_32x32_msa, vpx_dc_128_predictor_32x32_msa, vpx_v_predictor_32x32_msa, - vpx_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL, - vpx_tm_predictor_32x32_msa) + vpx_h_predictor_32x32_msa, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_tm_predictor_32x32_msa) #endif // HAVE_MSA +#if HAVE_VSX +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +INTRA_PRED_TEST(VSX, TestIntraPred4, nullptr, nullptr, nullptr, nullptr, + nullptr, vpx_h_predictor_4x4_vsx, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, vpx_tm_predictor_4x4_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, nullptr, nullptr, + nullptr, nullptr, vpx_h_predictor_8x8_vsx, + vpx_d45_predictor_8x8_vsx, nullptr, nullptr, nullptr, nullptr, + vpx_d63_predictor_8x8_vsx, vpx_tm_predictor_8x8_vsx) +#endif + +INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx, + vpx_dc_left_predictor_16x16_vsx, vpx_dc_top_predictor_16x16_vsx, + vpx_dc_128_predictor_16x16_vsx, vpx_v_predictor_16x16_vsx, + vpx_h_predictor_16x16_vsx, vpx_d45_predictor_16x16_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_16x16_vsx, + vpx_tm_predictor_16x16_vsx) + +INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, + vpx_dc_left_predictor_32x32_vsx, vpx_dc_top_predictor_32x32_vsx, + vpx_dc_128_predictor_32x32_vsx, vpx_v_predictor_32x32_vsx, + vpx_h_predictor_32x32_vsx, vpx_d45_predictor_32x32_vsx, nullptr, + nullptr, nullptr, nullptr, vpx_d63_predictor_32x32_vsx, + vpx_tm_predictor_32x32_vsx) +#endif // HAVE_VSX + +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + // ----------------------------------------------------------------------------- #if CONFIG_VP9_HIGHBITDEPTH namespace { -typedef void (*VpxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride, - const uint16_t *above, const uint16_t *left, - int bd); +using VpxHighbdPredFunc = void (*)(uint16_t *dst, ptrdiff_t y_stride, + const uint16_t *above, const uint16_t *left, + int bd); -typedef IntraPredTestMem Vp9HighbdIntraPredTestMem; +using Vp9HighbdIntraPredTestMem = IntraPredTestMem; void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, const char *const signatures[], int block_size) { @@ -333,7 +380,7 @@ void TestHighbdIntraPred(const char name[], VpxHighbdPredFunc const *pred_funcs, intra_pred_test_mem.Init(block_size, 12); for (int k = 0; k < kNumVp9IntraPredFuncs; ++k) { - if (pred_funcs[k] == NULL) continue; + if (pred_funcs[k] == nullptr) continue; memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src, sizeof(intra_pred_test_mem.src)); vpx_usec_timer timer; @@ -455,66 +502,115 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) #if HAVE_SSE2 -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, - vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c) +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, nullptr, + vpx_highbd_d135_predictor_4x4_sse2, vpx_highbd_d117_predictor_4x4_sse2, + vpx_highbd_d153_predictor_4x4_sse2, vpx_highbd_d207_predictor_4x4_sse2, + vpx_highbd_d63_predictor_4x4_sse2, vpx_highbd_tm_predictor_4x4_c) -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, - vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, vpx_highbd_tm_predictor_8x8_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + vpx_highbd_h_predictor_16x16_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_16x16_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_4x4_ssse3, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, nullptr) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, nullptr) +#endif // HAVE_SSSE3 + #if HAVE_NEON HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, - vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, - NULL, vpx_highbd_tm_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, - vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, - NULL, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH -#include "test/test_libvpx.cc" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/test_libvpx.cc b/media/libvpx/libvpx/test/test_libvpx.cc index 8a70b4e28c..ba27102385 100644 --- a/media/libvpx/libvpx/test/test_libvpx.cc +++ b/media/libvpx/libvpx/test/test_libvpx.cc @@ -7,67 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif -extern "C" { -#if CONFIG_VP8 -extern void vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 -extern void vp9_rtcd(); -#endif // CONFIG_VP9 -extern void vpx_dsp_rtcd(); -extern void vpx_scale_rtcd(); -} - -#if ARCH_X86 || ARCH_X86_64 -static void append_negative_gtest_filter(const char *str) { - std::string filter = ::testing::FLAGS_gtest_filter; - // Negative patterns begin with one '-' followed by a ':' separated list. - if (filter.find('-') == std::string::npos) filter += '-'; - filter += str; - ::testing::FLAGS_gtest_filter = filter; -} -#endif // ARCH_X86 || ARCH_X86_64 +#include "gtest/gtest.h" +#include "test/init_vpx_test.h" int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - -#if ARCH_X86 || ARCH_X86_64 - const int simd_caps = x86_simd_caps(); - if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); - if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); - if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); - if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); - if (!(simd_caps & HAS_SSSE3)) { - append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); - } - if (!(simd_caps & HAS_SSE4_1)) { - append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); - } - if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); - if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); -#endif // ARCH_X86 || ARCH_X86_64 - -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. - -#if CONFIG_VP8 - vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 - vp9_rtcd(); -#endif // CONFIG_VP9 - vpx_dsp_rtcd(); - vpx_scale_rtcd(); -#endif // !CONFIG_SHARED - + ::libvpx_test::init_vpx_test(); return RUN_ALL_TESTS(); } diff --git a/media/libvpx/libvpx/test/test_rc_interface.cc b/media/libvpx/libvpx/test/test_rc_interface.cc new file mode 100644 index 0000000000..840a299f82 --- /dev/null +++ b/media/libvpx/libvpx/test/test_rc_interface.cc @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/media/libvpx/libvpx/test/test_vector_test.cc b/media/libvpx/libvpx/test/test_vector_test.cc index 2dd33f73bc..2c0de86cba 100644 --- a/media/libvpx/libvpx/test/test_vector_test.cc +++ b/media/libvpx/libvpx/test/test_vector_test.cc @@ -10,9 +10,12 @@ #include #include +#include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "../tools_common.h" #include "./vpx_config.h" #include "test/codec_factory.h" @@ -28,18 +31,16 @@ namespace { -enum DecodeMode { kSerialMode, kFrameParallelMode }; - -const int kDecodeMode = 0; -const int kThreads = 1; +const int kThreads = 0; +const int kMtMode = 1; const int kFileName = 2; -typedef std::tr1::tuple DecodeParam; +using DecodeParam = std::tuple; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam { protected: - TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) { + TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(nullptr) { #if CONFIG_VP9_DECODER resize_clips_.insert(::libvpx_test::kVP9TestVectorsResize, ::libvpx_test::kVP9TestVectorsResize + @@ -47,19 +48,37 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #endif } - virtual ~TestVectorTest() { + ~TestVectorTest() override { if (md5_file_) fclose(md5_file_); } void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_NE(md5_file_, nullptr) + << "Md5 file open failed. Filename: " << md5_file_name_; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { - ASSERT_TRUE(md5_file_ != NULL); +#if CONFIG_VP9_DECODER + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { + if (video.frame_number() == 0 && mt_mode_ >= 0) { + if (mt_mode_ == 1) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); + decoder->Control(VP9D_SET_ROW_MT, 0); + } else if (mt_mode_ == 2) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0); + decoder->Control(VP9D_SET_ROW_MT, 1); + } else { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0); + decoder->Control(VP9D_SET_ROW_MT, 0); + } + } + } +#endif + + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { + ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -80,6 +99,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #if CONFIG_VP9_DECODER std::set resize_clips_; #endif + int mt_mode_; private: FILE *md5_file_; @@ -91,34 +111,20 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, // the test failed. TEST_P(TestVectorTest, MD5Match) { const DecodeParam input = GET_PARAM(1); - const std::string filename = std::tr1::get(input); - const int threads = std::tr1::get(input); - const int mode = std::tr1::get(input); + const std::string filename = std::get(input); vpx_codec_flags_t flags = 0; vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); char str[256]; - if (mode == kFrameParallelMode) { - flags |= VPX_CODEC_USE_FRAME_THREADING; -#if CONFIG_VP9_DECODER - // TODO(hkuang): Fix frame parallel decode bug. See issue 1086. - if (resize_clips_.find(filename) != resize_clips_.end()) { - printf("Skipping the test file: %s, due to frame parallel decode bug.\n", - filename.c_str()); - return; - } -#endif - } - - cfg.threads = threads; - + cfg.threads = std::get(input); + mt_mode_ = std::get(input); snprintf(str, sizeof(str) / sizeof(str[0]) - 1, - "file: %s mode: %s threads: %d", filename.c_str(), - mode == 0 ? "Serial" : "Parallel", threads); + "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads, + mt_mode_); SCOPED_TRACE(str); // Open compressed video file. - testing::internal::scoped_ptr video; + std::unique_ptr video; if (filename.substr(filename.length() - 3, 3) == "ivf") { video.reset(new libvpx_test::IVFVideoSource(filename)); } else if (filename.substr(filename.length() - 4, 4) == "webm") { @@ -130,7 +136,7 @@ TEST_P(TestVectorTest, MD5Match) { return; #endif } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); video->Init(); // Construct md5 file name. @@ -145,53 +151,52 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// Test VP8 decode in serial mode with single thread. -// NOTE: VP8 only support serial mode. #if CONFIG_VP8_DECODER -VP8_INSTANTIATE_TEST_CASE( +VP8_INSTANTIATE_TEST_SUITE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. - ::testing::Values(1), // Single thread. + ::testing::Values(1), // Single thread. + ::testing::Values(-1), // LPF opt and Row MT is not applicable ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors))); // Test VP8 decode in with different numbers of threads. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP8MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP8)), ::testing::Combine( - ::testing::Values(0), // Serial Mode. - ::testing::Range(1, 8), // With 1 ~ 8 threads. + ::testing::Range(2, 9), // With 2 ~ 8 threads. + ::testing::Values(-1), // LPF opt and Row MT is not applicable ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors)))); #endif // CONFIG_VP8_DECODER -// Test VP9 decode in serial mode with single thread. #if CONFIG_VP9_DECODER -VP9_INSTANTIATE_TEST_CASE( +VP9_INSTANTIATE_TEST_SUITE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. - ::testing::Values(1), // Single thread. + ::testing::Values(1), // Single thread. + ::testing::Values(-1), // LPF opt and Row MT is not applicable ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -// Test VP9 decode in frame parallel mode with different number of threads. -INSTANTIATE_TEST_CASE_P( - VP9MultiThreadedFrameParallel, TestVectorTest, +INSTANTIATE_TEST_SUITE_P( + VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP9)), ::testing::Combine( - ::testing::Values(1), // Frame Parallel mode. ::testing::Range(2, 9), // With 2 ~ 8 threads. + ::testing::Range(0, 3), // With multi threads modes 0 ~ 2 + // 0: LPF opt and Row MT disabled + // 1: LPF opt enabled + // 2: Row MT enabled ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors)))); diff --git a/media/libvpx/libvpx/test/test_vectors.cc b/media/libvpx/libvpx/test/test_vectors.cc index def78da282..954ff771a9 100644 --- a/media/libvpx/libvpx/test/test_vectors.cc +++ b/media/libvpx/libvpx/test/test_vectors.cc @@ -9,6 +9,7 @@ */ #include "test/test_vectors.h" +#include "vpx_config.h" namespace libvpx_test { @@ -371,6 +372,7 @@ const char *const kVP9TestVectors[] = { #endif // CONFIG_VP9_HIGHBITDEPTH "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", RESIZE_TEST_VECTORS }; const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; diff --git a/media/libvpx/libvpx/test/test_vectors.h b/media/libvpx/libvpx/test/test_vectors.h index 3df3e81133..0a4be0f1a2 100644 --- a/media/libvpx/libvpx/test/test_vectors.h +++ b/media/libvpx/libvpx/test/test_vectors.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_TEST_VECTORS_H_ -#define TEST_TEST_VECTORS_H_ +#ifndef VPX_TEST_TEST_VECTORS_H_ +#define VPX_TEST_TEST_VECTORS_H_ #include "./vpx_config.h" @@ -31,4 +31,4 @@ extern const char *const kVP9TestVectorsResize[]; } // namespace libvpx_test -#endif // TEST_TEST_VECTORS_H_ +#endif // VPX_TEST_TEST_VECTORS_H_ diff --git a/media/libvpx/libvpx/test/tile_independence_test.cc b/media/libvpx/libvpx/test/tile_independence_test.cc index e24981c68d..6bf203571f 100644 --- a/media/libvpx/libvpx/test/tile_independence_test.cc +++ b/media/libvpx/libvpx/test/tile_independence_test.cc @@ -11,12 +11,12 @@ #include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -#include "test/util.h" #include "test/md5_helper.h" +#include "test/util.h" #include "vpx_mem/vpx_mem.h" namespace { @@ -36,19 +36,19 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); } - virtual ~TileIndependenceTest() { + ~TileIndependenceTest() override { delete fw_dec_; delete inv_dec_; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); } } @@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5->Add(img); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { UpdateMD5(fw_dec_, pkt, &md5_fw_order_); UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } @@ -100,5 +100,5 @@ TEST_P(TileIndependenceTest, MD5Match) { ASSERT_STREQ(md5_fw_str, md5_inv_str); } -VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1)); +VP9_INSTANTIATE_TEST_SUITE(TileIndependenceTest, ::testing::Range(0, 2, 1)); } // namespace diff --git a/media/libvpx/libvpx/test/timestamp_test.cc b/media/libvpx/libvpx/test/timestamp_test.cc new file mode 100644 index 0000000000..3567824df3 --- /dev/null +++ b/media/libvpx/libvpx/test/timestamp_test.cc @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vpx_config.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 3; + +// A video source that exposes functions to set the timebase, framerate and +// starting pts. +class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { + public: + // Parameters num and den set the timebase for the video source. + DummyTimebaseVideoSource(int num, int den) + : timebase_({ num, den }), framerate_numerator_(30), + framerate_denominator_(1), starting_pts_(0) { + SetSize(kVideoSourceWidth, kVideoSourceHeight); + set_limit(kFramesToEncode); + } + + void SetFramerate(int numerator, int denominator) { + framerate_numerator_ = numerator; + framerate_denominator_ = denominator; + } + + // Returns one frames duration in timebase units as a double. + double FrameDuration() const { + return (static_cast(timebase_.den) / timebase_.num) / + (static_cast(framerate_numerator_) / framerate_denominator_); + } + + vpx_codec_pts_t pts() const override { + return static_cast(frame_ * FrameDuration() + + starting_pts_ + 0.5); + } + + unsigned long duration() const override { + return static_cast(FrameDuration() + 0.5); + } + + vpx_rational_t timebase() const override { return timebase_; } + + void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } + + private: + vpx_rational_t timebase_; + int framerate_numerator_; + int framerate_denominator_; + int64_t starting_pts_; +}; + +class TimestampTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + TimestampTest() : EncoderTest(GET_PARAM(0)) {} + ~TimestampTest() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } +}; + +// Tests encoding in millisecond timebase. +TEST_P(TimestampTest, EncodeFrames) { + DummyTimebaseVideoSource video(1, 1000); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestMicrosecondTimebase) { + // Set the timebase to microseconds. + DummyTimebaseVideoSource video(1, 1000000); + video.set_limit(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestVpxRollover) { + DummyTimebaseVideoSource video(1, 1000); + video.set_starting_pts(922337170351ll); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_REALTIME_ONLY +VP8_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kRealTime)); +VP9_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kRealTime)); +#else +VP8_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_SUITE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +#endif +} // namespace diff --git a/media/libvpx/libvpx/test/tools_common.sh b/media/libvpx/libvpx/test/tools_common.sh index 0bdcc08d78..d0dd24df36 100644 --- a/media/libvpx/libvpx/test/tools_common.sh +++ b/media/libvpx/libvpx/test/tools_common.sh @@ -133,7 +133,7 @@ vpx_config_option_enabled() { vpx_config_option="${1}" vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h" config_line=$(grep "${vpx_config_option}" "${vpx_config_file}") - if echo "${config_line}" | egrep -q '1$'; then + if echo "${config_line}" | grep -E -q '1$'; then echo yes fi } @@ -150,7 +150,7 @@ is_windows_target() { # empty string. Caller is responsible for testing the string once the function # returns. vpx_tool_path() { - local readonly tool_name="$1" + local tool_name="$1" local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}" if [ ! -x "${tool_path}" ]; then # Try one directory up: when running via examples.sh the tool could be in @@ -222,7 +222,7 @@ filter_strings() { if [ -n "${filter}" ]; then for s in ${strings}; do - if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then filtered_strings="${filtered_strings} ${s}" fi done @@ -280,7 +280,12 @@ run_tests() { test_end "${test}" done - local tested_config="$(test_configuration_target) @ $(current_hash)" + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi echo "${test_name}: Done, all tests pass for ${tested_config}." } @@ -404,12 +409,16 @@ VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm" VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm" VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm" +VP9_RAW_FILE="${LIBVPX_TEST_DATA_PATH}/crbug-1539.rawfile" + YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" YUV_RAW_INPUT_WIDTH=352 YUV_RAW_INPUT_HEIGHT=288 Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m" Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" +Y4M_720P_INPUT_WIDTH=1280 +Y4M_720P_INPUT_HEIGHT=720 # Setup a trap function to clean up after tests complete. trap cleanup EXIT diff --git a/media/libvpx/libvpx/test/twopass_encoder.sh b/media/libvpx/libvpx/test/twopass_encoder.sh index 7a223f2afc..69ecbacd0c 100644 --- a/media/libvpx/libvpx/test/twopass_encoder.sh +++ b/media/libvpx/libvpx/test/twopass_encoder.sh @@ -37,7 +37,7 @@ twopass_encoder() { eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \ - ${devnull} + ${devnull} || return 1 [ -e "${output_file}" ] || return 1 } @@ -54,7 +54,10 @@ twopass_encoder_vp9() { fi } -twopass_encoder_tests="twopass_encoder_vp8 - twopass_encoder_vp9" -run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/media/libvpx/libvpx/test/user_priv_test.cc b/media/libvpx/libvpx/test/user_priv_test.cc index 4b5de094e9..0f45b082e5 100644 --- a/media/libvpx/libvpx/test/user_priv_test.cc +++ b/media/libvpx/libvpx/test/user_priv_test.cc @@ -11,7 +11,7 @@ #include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/acm_random.h" #include "test/codec_factory.h" @@ -27,8 +27,8 @@ namespace { -using std::string; using libvpx_test::ACMRandom; +using std::string; #if CONFIG_WEBM_IO @@ -57,28 +57,28 @@ string DecodeFile(const string &filename) { void *user_priv = reinterpret_cast(&frame_num); const vpx_codec_err_t res = decoder.DecodeFrame(video.cxdata(), video.frame_size(), - (frame_num == 0) ? NULL : user_priv); + (frame_num == 0) ? nullptr : user_priv); if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); break; } libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data. while ((img = dec_iter.Next())) { if (frame_num == 0) { - CheckUserPrivateData(img->user_priv, NULL); + CheckUserPrivateData(img->user_priv, nullptr); } else { CheckUserPrivateData(img->user_priv, &frame_num); // Also test ctrl_get_reference api. - struct vp9_ref_frame ref; + struct vp9_ref_frame ref = vp9_ref_frame(); // Randomly fetch a reference frame. ref.idx = rnd.Rand8() % 3; decoder.Control(VP9_GET_REFERENCE, &ref); - CheckUserPrivateData(ref.img.user_priv, NULL); + CheckUserPrivateData(ref.img.user_priv, nullptr); } md5.Add(img); } diff --git a/media/libvpx/libvpx/test/util.h b/media/libvpx/libvpx/test/util.h index 1f2540ecf2..94bab66416 100644 --- a/media/libvpx/libvpx/test/util.h +++ b/media/libvpx/libvpx/test/util.h @@ -8,16 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_UTIL_H_ -#define TEST_UTIL_H_ +#ifndef VPX_TEST_UTIL_H_ +#define VPX_TEST_UTIL_H_ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "vpx/vpx_image.h" // Macros -#define GET_PARAM(k) std::tr1::get(GetParam()) +#define GET_PARAM(k) std::get(GetParam()) inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) { assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) && @@ -43,4 +45,4 @@ inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) { return psnr; } -#endif // TEST_UTIL_H_ +#endif // VPX_TEST_UTIL_H_ diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc index 6e31165faf..aceb4e4168 100644 --- a/media/libvpx/libvpx/test/variance_test.cc +++ b/media/libvpx/libvpx/test/variance_test.cc @@ -11,7 +11,7 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -20,26 +20,19 @@ #include "test/register_state_check.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" namespace { -typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse, - const uint8_t *second_pred); -typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride); -typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); +using Get4x4SseFunc = unsigned int (*)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); +using GetVarianceFunc = void (*)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int *sum); +using SumOfSquaresFunction = unsigned int (*)(const int16_t *src); using libvpx_test::ACMRandom; @@ -73,35 +66,65 @@ static unsigned int mb_ss_ref(const int16_t *src) { * Our codebase calculates the "diff" value in the variance algorithm by * (src - ref). */ -static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, - int l2h, int src_stride, int ref_stride, - uint32_t *sse_ptr, bool use_high_bit_depth_, - vpx_bit_depth_t bit_depth) { - int64_t se = 0; - uint64_t sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; +static void variance(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int w, int h, bool use_high_bit_depth_, + uint64_t *sse, int64_t *se, vpx_bit_depth_t bit_depth) { + int64_t se_long = 0; + uint64_t sse_long = 0; + for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { - int diff; + int diff = 0; if (!use_high_bit_depth_) { diff = src[y * src_stride + x] - ref[y * ref_stride + x]; - se += diff; - sse += diff * diff; #if CONFIG_VP9_HIGHBITDEPTH } else { diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; - se += diff; - sse += diff * diff; #endif // CONFIG_VP9_HIGHBITDEPTH } + se_long += diff; + sse_long += diff * diff; } } - RoundHighBitDepth(bit_depth, &se, &sse); - *sse_ptr = static_cast(sse); + + RoundHighBitDepth(bit_depth, &se_long, &sse_long); + + *sse = sse_long; + *se = se_long; +} + +static void get_variance_ref(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int l2w, + int l2h, bool use_high_bit_depth_, uint32_t *sse, + int *se, vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse = static_cast(sse_long); + *se = static_cast(se_long); +} + +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + vpx_bit_depth_t bit_depth) { + const int w = 1 << l2w; + const int h = 1 << l2h; + int64_t se_long = 0; + uint64_t sse_long = 0; + + variance(src, src_stride, ref, ref_stride, w, h, use_high_bit_depth_, + &sse_long, &se_long, bit_depth); + + *sse_ptr = static_cast(sse_long); return static_cast( - sse - ((static_cast(se) * se) >> (l2w + l2h))); + sse_long - ((static_cast(se_long) * se_long) >> (l2w + l2h))); } /* The subpel reference functions differ from the codec version in one aspect: @@ -220,7 +243,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam { public: SumOfSquaresTest() : func_(GetParam()) {} - virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); } + ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); } protected: void ConstTest(); @@ -263,7 +286,7 @@ void SumOfSquaresTest::RefTest() { template struct TestParams { - TestParams(int log2w = 0, int log2h = 0, Func function = NULL, + TestParams(int log2w = 0, int log2h = 0, Func function = nullptr, int bit_depth_value = 0) : log2width(log2w), log2height(log2h), func(function) { use_high_bit_depth = (bit_depth_value > 0); @@ -299,7 +322,7 @@ template class MainTestClass : public ::testing::TestWithParam > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -307,8 +330,8 @@ class MainTestClass use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t); src_ = reinterpret_cast(vpx_memalign(16, block_size() * unit)); ref_ = new uint8_t[block_size() * unit]; - ASSERT_TRUE(src_ != NULL); - ASSERT_TRUE(ref_ != NULL); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -318,7 +341,7 @@ class MainTestClass #endif } - virtual void TearDown() { + void TearDown() override { #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -329,8 +352,8 @@ class MainTestClass vpx_free(src_); delete[] ref_; - src_ = NULL; - ref_ = NULL; + src_ = nullptr; + ref_ = nullptr; libvpx_test::ClearSystemState(); } @@ -345,6 +368,10 @@ class MainTestClass void RefTest(); void RefStrideTest(); void OneQuarterTest(); + void SpeedTest(); + + // GetVariance tests + void RefTestGetVar(); // MSE/SSE tests void RefTestMse(); @@ -363,6 +390,7 @@ class MainTestClass int byte_shift() const { return params_.bit_depth - 8; } int block_size() const { return params_.block_size; } int width() const { return params_.width; } + int height() const { return params_.height; } uint32_t mask() const { return params_.mask; } }; @@ -471,6 +499,64 @@ void MainTestClass::OneQuarterTest() { EXPECT_EQ(expected, var); } +template +void MainTestClass::SpeedTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < (1 << 30) / block_size(); ++i) { + const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); + // Ignore return value. + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(), + params_.bit_depth, elapsed_time / 1000); +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to GetVariance. +template +void MainTestClass::RefTestGetVar() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + unsigned int sse1, sse2; + int sum1, sum2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + params_.func(src_, stride, ref_, stride, &sse1, &sum1)); + get_variance_ref(src_, stride, ref_, stride, params_.log2width, + params_.log2height, use_high_bit_depth(), &sse2, &sum2, + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(sum1, sum2) << "Error at test index: " << i; + } +} + //////////////////////////////////////////////////////////////////////////////// // Tests related to MSE / SSE. @@ -478,14 +564,21 @@ template void MainTestClass::RefTestMse() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size(); ++j) { - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); +#endif // CONFIG_VP9_HIGHBITDEPTH + } } unsigned int sse1, sse2; const int stride = width(); ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, - stride, &sse2, false, VPX_BITS_8); + stride, &sse2, use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2); } } @@ -509,8 +602,15 @@ void MainTestClass::RefTestSse() { template void MainTestClass::MaxTestMse() { - memset(src_, 255, block_size()); - memset(ref_, 0, block_size()); + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse; ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); const unsigned int expected = block_size() * 255 * 255; @@ -529,62 +629,43 @@ void MainTestClass::MaxTestSse() { //////////////////////////////////////////////////////////////////////////////// -using ::std::tr1::get; -using ::std::tr1::make_tuple; -using ::std::tr1::tuple; - -template +template class SubpelVarianceTest - : public ::testing::TestWithParam< - tuple > { + : public ::testing::TestWithParam > { public: - virtual void SetUp() { - const tuple ¶ms = - this->GetParam(); - log2width_ = get<0>(params); - width_ = 1 << log2width_; - log2height_ = get<1>(params); - height_ = 1 << log2height_; - subpel_variance_ = get<2>(params); - if (get<3>(params)) { - bit_depth_ = (vpx_bit_depth_t)get<3>(params); - use_high_bit_depth_ = true; - } else { - bit_depth_ = VPX_BITS_8; - use_high_bit_depth_ = false; - } - mask_ = (1 << bit_depth_) - 1; + void SetUp() override { + params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); - block_size_ = width_ * height_; - if (!use_high_bit_depth_) { - src_ = reinterpret_cast(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast(vpx_memalign(16, block_size())); + ref_ = reinterpret_cast( + vpx_malloc(block_size() + width() + height() + 1)); #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); - ref_ = - CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR(reinterpret_cast(vpx_malloc( + (block_size() + width() + height() + 1) * sizeof(uint16_t)))); #endif // CONFIG_VP9_HIGHBITDEPTH } - ASSERT_TRUE(src_ != NULL); - ASSERT_TRUE(sec_ != NULL); - ASSERT_TRUE(ref_ != NULL); + ASSERT_NE(src_, nullptr); + ASSERT_NE(sec_, nullptr); + ASSERT_NE(ref_, nullptr); } - virtual void TearDown() { - if (!use_high_bit_depth_) { + void TearDown() override { + if (!use_high_bit_depth()) { vpx_free(src_); - delete[] ref_; vpx_free(sec_); + vpx_free(ref_); #if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); - delete[] CONVERT_TO_SHORTPTR(ref_); + vpx_free(CONVERT_TO_SHORTPTR(ref_)); vpx_free(CONVERT_TO_SHORTPTR(sec_)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -594,47 +675,51 @@ class SubpelVarianceTest protected: void RefTest(); void ExtremeRefTest(); + void SpeedTest(); ACMRandom rnd_; uint8_t *src_; uint8_t *ref_; uint8_t *sec_; - bool use_high_bit_depth_; - vpx_bit_depth_t bit_depth_; - int width_, log2width_; - int height_, log2height_; - int block_size_, mask_; - SubpelVarianceFunctionType subpel_variance_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } }; template void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -648,108 +733,143 @@ void SubpelVarianceTest::ExtremeRefTest() { // Ref: Set the first half of values to the maximum, the second half to 0. for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - const int half = block_size_ / 2; - if (!use_high_bit_depth_) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); + memset(ref_ + half, 0, half + width() + height() + 1); #if CONFIG_VP9_HIGHBITDEPTH } else { - vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); - vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, - half + width_ + height_ + 1); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; } } } +template +void SubpelVarianceTest::SpeedTest() { + // The only interesting points are 0, 4, and anything else. To make the loops + // simple we will use 0, 2 and 4. + for (int x = 0; x <= 4; x += 2) { + for (int y = 0; y <= 4; y += 2) { + if (!use_high_bit_depth()) { + memset(src_, 25, block_size()); + memset(ref_, 50, block_size()); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size()); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size()); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + unsigned int sse; + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < 1000000000 / block_size(); ++i) { + const uint32_t variance = + params_.func(ref_, width() + 1, x, y, src_, width(), &sse); + (void)variance; + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n", + width(), height(), x, y, elapsed_time / 1000); + } + } +} + template <> -void SubpelVarianceTest::RefTest() { +void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; - CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } uint32_t sse1, sse2; uint32_t var1, var2; - ASM_REGISTER_STATE_CHECK(var1 = - subpel_variance_(ref_, width_ + 1, x, y, - src_, width_, &sse1, sec_)); - var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2, use_high_bit_depth_, - static_cast(bit_depth_)); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } } } -typedef MainTestClass VpxSseTest; -typedef MainTestClass VpxMseTest; -typedef MainTestClass VpxVarianceTest; -typedef SubpelVarianceTest VpxSubpelVarianceTest; -typedef SubpelVarianceTest VpxSubpelAvgVarianceTest; +using VpxSseTest = MainTestClass; +using VpxMseTest = MainTestClass; +using VpxVarianceTest = MainTestClass; +using VpxGetVarianceTest = MainTestClass; +using VpxSubpelVarianceTest = SubpelVarianceTest; +using VpxSubpelAvgVarianceTest = SubpelVarianceTest; TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } TEST_P(VpxMseTest, RefMse) { RefTestMse(); } TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); } -INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_c)); +INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_c)); -typedef TestParams SseParams; -INSTANTIATE_TEST_CASE_P(C, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_c))); +using SseParams = TestParams; +INSTANTIATE_TEST_SUITE_P(C, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_c))); -typedef TestParams MseParams; -INSTANTIATE_TEST_CASE_P(C, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), - MseParams(4, 3, &vpx_mse16x8_c), - MseParams(3, 4, &vpx_mse8x16_c), - MseParams(3, 3, &vpx_mse8x8_c))); +using MseParams = TestParams; +INSTANTIATE_TEST_SUITE_P(C, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), + MseParams(4, 3, &vpx_mse16x8_c), + MseParams(3, 4, &vpx_mse8x16_c), + MseParams(3, 3, &vpx_mse8x8_c))); -typedef TestParams VarianceParams; -INSTANTIATE_TEST_CASE_P( +using VarianceParams = TestParams; +INSTANTIATE_TEST_SUITE_P( C, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), VarianceParams(6, 5, &vpx_variance64x32_c), @@ -765,72 +885,91 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); -INSTANTIATE_TEST_CASE_P( - C, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); +using GetVarianceParams = TestParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c), + GetVarianceParams(4, 4, &vpx_get16x16var_c), + GetVarianceParams(3, 3, &vpx_get8x8var_c))); -INSTANTIATE_TEST_CASE_P( +using SubpelVarianceParams = TestParams; +INSTANTIATE_TEST_SUITE_P( + C, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + +using SubpelAvgVarianceParams = TestParams; +INSTANTIATE_TEST_SUITE_P( C, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH -typedef MainTestClass VpxHBDMseTest; -typedef MainTestClass VpxHBDVarianceTest; -typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; -typedef SubpelVarianceTest VpxHBDSubpelAvgVarianceTest; +using VpxHBDVarianceTest = MainTestClass; +using VpxHBDGetVarianceTest = MainTestClass; +using VpxHBDSubpelVarianceTest = SubpelVarianceTest; +using VpxHBDSubpelAvgVarianceTest = + SubpelVarianceTest; -TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } -TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(VpxHBDGetVarianceTest, RefGetVar) { RefTestGetVar(); } TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } -/* TODO(debargha): This test does not support the highbd version -INSTANTIATE_TEST_CASE_P( +using VpxHBDMseTest = MainTestClass; +TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } +TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); } +INSTANTIATE_TEST_SUITE_P( C, VpxHBDMseTest, - ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x8_c))); -*/ + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest); + +INSTANTIATE_TEST_SUITE_P( C, VpxHBDVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_highbd_12_variance64x64_c, 12), VarianceParams(6, 5, &vpx_highbd_12_variance64x32_c, 12), @@ -872,104 +1011,186 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_highbd_8_variance4x8_c, 8), VarianceParams(2, 2, &vpx_highbd_8_variance4x4_c, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + C, VpxHBDGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_c, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_c, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_c, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_c, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_c, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_c, 8))); + +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_sse2)); -INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2), - MseParams(4, 3, &vpx_mse16x8_sse2), - MseParams(3, 4, &vpx_mse8x16_sse2), - MseParams(3, 3, &vpx_mse8x8_sse2))); +INSTANTIATE_TEST_SUITE_P(SSE2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_sse2), + MseParams(4, 3, &vpx_mse16x8_sse2), + MseParams(3, 4, &vpx_mse8x16_sse2), + MseParams(3, 3, &vpx_mse8x8_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_sse2), VarianceParams(6, 5, &vpx_variance64x32_sse2), @@ -985,58 +1206,61 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_sse2), VarianceParams(2, 2, &vpx_variance4x4_sse2))); -INSTANTIATE_TEST_CASE_P( - SSE2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2), + GetVarianceParams(4, 4, &vpx_get16x16var_sse2), + GetVarianceParams(3, 3, &vpx_get8x8var_sse2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH -/* TODO(debargha): This test does not support the highbd version -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDMseTest, - ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2), - MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2), - MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2), - MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2), - MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2))); -*/ + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDVarianceTest, ::testing::Values( VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sse2, 12), @@ -1070,183 +1294,745 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sse2, 8), VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sse2, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + SSE2, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sse2, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sse2, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sse2, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sse2, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sse2, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sse2, 8))); + +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); #endif // HAVE_SSSE3 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2))); +INSTANTIATE_TEST_SUITE_P(AVX2, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2), + MseParams(4, 3, &vpx_mse16x8_avx2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2), VarianceParams(6, 5, &vpx_variance64x32_avx2), + VarianceParams(5, 6, &vpx_variance32x64_avx2), VarianceParams(5, 5, &vpx_variance32x32_avx2), VarianceParams(5, 4, &vpx_variance32x16_avx2), - VarianceParams(4, 4, &vpx_variance16x16_avx2))); + VarianceParams(4, 5, &vpx_variance16x32_avx2), + VarianceParams(4, 4, &vpx_variance16x16_avx2), + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); #endif // HAVE_AVX2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_neon))); +INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_neon))); -INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon))); +INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon), VarianceParams(6, 5, &vpx_variance64x32_neon), VarianceParams(5, 6, &vpx_variance32x64_neon), VarianceParams(5, 5, &vpx_variance32x32_neon), + VarianceParams(5, 4, &vpx_variance32x16_neon), + VarianceParams(4, 5, &vpx_variance16x32_neon), VarianceParams(4, 4, &vpx_variance16x16_neon), VarianceParams(4, 3, &vpx_variance16x8_neon), VarianceParams(3, 4, &vpx_variance8x16_neon), - VarianceParams(3, 3, &vpx_variance8x8_neon))); + VarianceParams(3, 3, &vpx_variance8x8_neon), + VarianceParams(3, 2, &vpx_variance8x4_neon), + VarianceParams(2, 3, &vpx_variance4x8_neon), + VarianceParams(2, 2, &vpx_variance4x4_neon))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + NEON, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon), + GetVarianceParams(4, 4, &vpx_get16x16var_neon), + GetVarianceParams(3, 3, &vpx_get8x8var_neon))); + +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), + VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod), + VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod), + VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod), + VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod), + VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod), + VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod), + VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod), + VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod), + VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod), + VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), + VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), + VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod), + GetVarianceParams(4, 4, &vpx_get16x16var_neon_dotprod), + GetVarianceParams(3, 3, &vpx_get8x8var_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + +INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDMseTest, + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sve, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_sve, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_sve, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_sve, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_sve, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_sve, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_sve, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_sve, VPX_BITS_10))); +#endif // HAVE_SVE + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_neon, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_neon, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_neon, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_neon, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_neon, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_neon, 8))); + +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDGetVarianceTest, + ::testing::Values( + GetVarianceParams(4, 4, &vpx_highbd_12_get16x16var_sve, 12), + GetVarianceParams(3, 3, &vpx_highbd_12_get8x8var_sve, 12), + GetVarianceParams(4, 4, &vpx_highbd_10_get16x16var_sve, 10), + GetVarianceParams(3, 3, &vpx_highbd_10_get8x8var_sve, 10), + GetVarianceParams(4, 4, &vpx_highbd_8_get16x16var_sve, 8), + GetVarianceParams(3, 3, &vpx_highbd_8_get8x8var_sve, 8))); +#endif // HAVE_SVE + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, + 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, + 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, + 8))); + +INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_neon, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_neon, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_neon, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_neon, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_neon, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_neon, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_neon, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_neon, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_neon, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_neon, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, + 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_neon, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_neon, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_neon, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_neon, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_neon, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_neon, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_neon, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_neon, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_neon, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, + 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_neon, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_neon, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_neon, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_neon, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_neon, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_neon, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_neon, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_neon, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_neon, + 8), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, + 8))); + +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON +#if HAVE_SVE +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, VpxHBDVarianceTest, + ::testing::Values( + VarianceParams(6, 6, &vpx_highbd_12_variance64x64_sve, 12), + VarianceParams(6, 5, &vpx_highbd_12_variance64x32_sve, 12), + VarianceParams(5, 6, &vpx_highbd_12_variance32x64_sve, 12), + VarianceParams(5, 5, &vpx_highbd_12_variance32x32_sve, 12), + VarianceParams(5, 4, &vpx_highbd_12_variance32x16_sve, 12), + VarianceParams(4, 5, &vpx_highbd_12_variance16x32_sve, 12), + VarianceParams(4, 4, &vpx_highbd_12_variance16x16_sve, 12), + VarianceParams(4, 3, &vpx_highbd_12_variance16x8_sve, 12), + VarianceParams(3, 4, &vpx_highbd_12_variance8x16_sve, 12), + VarianceParams(3, 3, &vpx_highbd_12_variance8x8_sve, 12), + VarianceParams(3, 2, &vpx_highbd_12_variance8x4_sve, 12), + VarianceParams(2, 3, &vpx_highbd_12_variance4x8_sve, 12), + VarianceParams(2, 2, &vpx_highbd_12_variance4x4_sve, 12), + VarianceParams(6, 6, &vpx_highbd_10_variance64x64_sve, 10), + VarianceParams(6, 5, &vpx_highbd_10_variance64x32_sve, 10), + VarianceParams(5, 6, &vpx_highbd_10_variance32x64_sve, 10), + VarianceParams(5, 5, &vpx_highbd_10_variance32x32_sve, 10), + VarianceParams(5, 4, &vpx_highbd_10_variance32x16_sve, 10), + VarianceParams(4, 5, &vpx_highbd_10_variance16x32_sve, 10), + VarianceParams(4, 4, &vpx_highbd_10_variance16x16_sve, 10), + VarianceParams(4, 3, &vpx_highbd_10_variance16x8_sve, 10), + VarianceParams(3, 4, &vpx_highbd_10_variance8x16_sve, 10), + VarianceParams(3, 3, &vpx_highbd_10_variance8x8_sve, 10), + VarianceParams(3, 2, &vpx_highbd_10_variance8x4_sve, 10), + VarianceParams(2, 3, &vpx_highbd_10_variance4x8_sve, 10), + VarianceParams(2, 2, &vpx_highbd_10_variance4x4_sve, 10), + VarianceParams(6, 6, &vpx_highbd_8_variance64x64_sve, 8), + VarianceParams(6, 5, &vpx_highbd_8_variance64x32_sve, 8), + VarianceParams(5, 6, &vpx_highbd_8_variance32x64_sve, 8), + VarianceParams(5, 5, &vpx_highbd_8_variance32x32_sve, 8), + VarianceParams(5, 4, &vpx_highbd_8_variance32x16_sve, 8), + VarianceParams(4, 5, &vpx_highbd_8_variance16x32_sve, 8), + VarianceParams(4, 4, &vpx_highbd_8_variance16x16_sve, 8), + VarianceParams(4, 3, &vpx_highbd_8_variance16x8_sve, 8), + VarianceParams(3, 4, &vpx_highbd_8_variance8x16_sve, 8), + VarianceParams(3, 3, &vpx_highbd_8_variance8x8_sve, 8), + VarianceParams(3, 2, &vpx_highbd_8_variance8x4_sve, 8), + VarianceParams(2, 3, &vpx_highbd_8_variance4x8_sve, 8), + VarianceParams(2, 2, &vpx_highbd_8_variance4x4_sve, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SVE + #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest, - ::testing::Values(vpx_get_mb_ss_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_msa)); -INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest, - ::testing::Values(SseParams(2, 2, - &vpx_get4x4sse_cs_msa))); +INSTANTIATE_TEST_SUITE_P(MSA, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_msa))); -INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa), - MseParams(4, 3, &vpx_mse16x8_msa), - MseParams(3, 4, &vpx_mse8x16_msa), - MseParams(3, 3, &vpx_mse8x8_msa))); +INSTANTIATE_TEST_SUITE_P(MSA, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_msa), + MseParams(4, 3, &vpx_mse16x8_msa), + MseParams(3, 4, &vpx_mse8x16_msa), + MseParams(3, 3, &vpx_mse8x8_msa))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_msa), VarianceParams(6, 5, &vpx_variance64x32_msa), @@ -1262,36 +2048,167 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_msa), VarianceParams(2, 2, &vpx_variance4x4_msa))); -INSTANTIATE_TEST_CASE_P( - MSA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), - make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); +INSTANTIATE_TEST_SUITE_P( + MSA, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa), + GetVarianceParams(4, 4, &vpx_get16x16var_msa), + GetVarianceParams(3, 3, &vpx_get8x8var_msa))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( + MSA, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + +INSTANTIATE_TEST_SUITE_P( MSA, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); #endif // HAVE_MSA + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_vsx)); + +INSTANTIATE_TEST_SUITE_P(VSX, VpxSseTest, + ::testing::Values(SseParams(2, 2, + &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_SUITE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_SUITE_P( + VSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); + +INSTANTIATE_TEST_SUITE_P( + VSX, VpxGetVarianceTest, + ::testing::Values(GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx), + GetVarianceParams(4, 4, &vpx_get16x16var_vsx), + GetVarianceParams(3, 3, &vpx_get8x8var_vsx))); +#endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_SUITE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx))); + +INSTANTIATE_TEST_SUITE_P( + LSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx), + VarianceParams(5, 5, &vpx_variance32x32_lsx), + VarianceParams(4, 4, &vpx_variance16x16_lsx), + VarianceParams(3, 3, &vpx_variance8x8_lsx))); + +INSTANTIATE_TEST_SUITE_P( + LSX, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0))); + +INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest, + ::testing::Values(SubpelAvgVarianceParams( + 6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0))); +#endif } // namespace diff --git a/media/libvpx/libvpx/test/video_source.h b/media/libvpx/libvpx/test/video_source.h index 54f692865b..419d162596 100644 --- a/media/libvpx/libvpx/test/video_source.h +++ b/media/libvpx/libvpx/test/video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_VIDEO_SOURCE_H_ -#define TEST_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_VIDEO_SOURCE_H_ +#define VPX_TEST_VIDEO_SOURCE_H_ #if defined(_WIN32) #undef NOMINMAX @@ -20,7 +20,13 @@ #endif #include #include +#include +#include #include + +#if !defined(_WIN32) +#include "gtest/gtest.h" +#endif #include "test/acm_random.h" #include "vpx/vpx_encoder.h" @@ -36,7 +42,7 @@ namespace libvpx_test { // A simple function to encapsulate cross platform retrieval of test data path static std::string GetDataPath() { const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH"); - if (data_path == NULL) { + if (data_path == nullptr) { #ifdef LIBVPX_TEST_DATA_PATH // In some environments, we cannot set environment variables // Instead, we set the data path by using a preprocessor symbol @@ -58,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -67,18 +73,37 @@ static FILE *GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } - return NULL; + return nullptr; #else - return tmpfile(); + std::string temp_dir = testing::TempDir(); + if (temp_dir.empty()) return nullptr; + // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may + // use the value of an environment variable without checking for a trailing + // path delimiter. + if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/'; + const char name_template[] = "libvpxtest.XXXXXX"; + std::unique_ptr temp_file_name( + new char[temp_dir.size() + sizeof(name_template)]); + if (temp_file_name == nullptr) return nullptr; + memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size()); + memcpy(temp_file_name.get() + temp_dir.size(), name_template, + sizeof(name_template)); + const int fd = mkstemp(temp_file_name.get()); + if (fd == -1) return nullptr; + *file_name = temp_file_name.get(); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { @@ -92,7 +117,7 @@ class TempOutFile { void CloseFile() { if (file_) { fclose(file_); - file_ = NULL; + file_ = nullptr; } } FILE *file_; @@ -111,7 +136,7 @@ class VideoSource { // Advance the cursor to the next frame virtual void Next() = 0; - // Get the current video frame, or NULL on End-Of-Stream. + // Get the current video frame, or nullptr on End-Of-Stream. virtual vpx_image_t *img() const = 0; // Get the presentation timestamp of the current frame. @@ -133,38 +158,40 @@ class VideoSource { class DummyVideoSource : public VideoSource { public: DummyVideoSource() - : img_(NULL), limit_(100), width_(80), height_(64), + : img_(nullptr), limit_(100), width_(80), height_(64), format_(VPX_IMG_FMT_I420) { ReallocImage(); } - virtual ~DummyVideoSource() { vpx_img_free(img_); } + ~DummyVideoSource() override { vpx_img_free(img_); } - virtual void Begin() { + void Begin() override { frame_ = 0; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { 1, 30 }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } void set_limit(unsigned int limit) { limit_ = limit; } @@ -190,8 +217,9 @@ class DummyVideoSource : public VideoSource { void ReallocImage() { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format_, width_, height_, 32); - raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8; + img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32); + ASSERT_NE(img_, nullptr); + raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8; } vpx_image_t *img_; @@ -208,17 +236,17 @@ class RandomVideoSource : public DummyVideoSource { RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) : rnd_(seed), seed_(seed) {} - protected: // Reset the RNG to get a matching stream for the second pass - virtual void Begin() { + void Begin() override { frame_ = 0; rnd_.Reset(seed_); FillFrame(); } + protected: // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. - virtual void FillFrame() { + void FillFrame() override { if (img_) { if (frame_ % 30 < 15) { for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); @@ -255,4 +283,4 @@ class CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_VIDEO_SOURCE_H_ +#endif // VPX_TEST_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/vp8_boolcoder_test.cc b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc index 9d81f9382a..0e2c7b77e1 100644 --- a/media/libvpx/libvpx/test/vp8_boolcoder_test.cc +++ b/media/libvpx/libvpx/test/vp8_boolcoder_test.cc @@ -15,7 +15,7 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "vp8/decoder/dboolhuff.h" @@ -93,6 +93,9 @@ TEST(VP8, TestBitIO) { } vp8_stop_encode(&bw); + // vp8dx_bool_decoder_fill() may read into uninitialized data that + // isn't used meaningfully, but may trigger an MSan warning. + memset(bw_buffer + bw.pos, 0, sizeof(VP8_BD_VALUE) - 1); BOOL_DECODER br; encrypt_buffer(bw_buffer, kBufferSize); diff --git a/media/libvpx/libvpx/test/vp8_datarate_test.cc b/media/libvpx/libvpx/test/vp8_datarate_test.cc new file mode 100644 index 0000000000..63fc724b7d --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_datarate_test.cc @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_encoder.h" + +namespace { + +class DatarateTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} + + ~DatarateTestLarge() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + ResetModel(); + } + + void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + first_drop_ = 0; + bits_total_ = 0; + duration_ = 0.0; + // Denoiser is off by default. + denoiser_on_ = 0; + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + gf_boost_ = 0; + use_roi_ = false; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); + } + + if (use_roi_) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + // TODO(jimbankoski): Remove these lines when the issue: + // http://code.google.com/p/webm/issues/detail?id=496 is fixed. + // For now the codec assumes buffer starts at starting buffer rate + // plus one frame's time. + if (last_pts_ == 0) duration = 1; + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + /* Test the buffer model here before subtracting the frame. Do so because + * the way the leaky bucket model works in libvpx is to allow the buffer to + * empty - and then stop showing frames until we've got enough bits to + * show one. As noted in comment below (issue 495), this does not currently + * apply to key frames. For now exclude key frames in condition below. */ + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (!key_frame) { + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + + const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Subtract from the buffer the bits associated with a played back frame. + bits_in_buffer_model_ -= frame_size_in_bits; + + // Update the running total of bits for end of test datarate checks. + bits_total_ += frame_size_in_bits; + + // If first drop not set and we have a drop set it to this time. + if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; + + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + + // We update this so that we can calculate the datarate minus the last + // frame encoded in the file. + bits_in_last_frame_ = frame_size_in_bits; + + ++frame_number_; + } + + void EndPassHook() override { + if (bits_total_) { + const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit + + duration_ = (last_pts_ + 1) * timebase_; + + // Effective file datarate includes the time spent prebuffering. + effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / + (cfg_.rc_buf_initial_sz / 1000.0 + duration_); + + file_datarate_ = file_size_in_kb / duration_; + } + } + + virtual void DenoiserLevelsTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + cfg_.rc_target_bitrate = 300; + ResetModel(); + denoiser_on_ = j; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void DenoiserOffOnTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + virtual void BasicBufferModelTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene. Ignoring datarate + // constraints + // TODO(jimbankoski): ( Fix when issue + // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + + // There is an issue for low bitrates in real-time mode, where the + // effective_datarate slightly overshoots the target bitrate. + // This is same the issue as noted about (#495). + // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), + // when the issue is resolved. + for (int i = 100; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void ChangingDropFrameThreshTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_max_quantizer = 36; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.kf_mode = VPX_KF_DISABLED; + + const int frame_count = 40; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, frame_count); + + // Here we check that the first dropped frame gets earlier and earlier + // as the drop frame threshold is increased. + + const int kDropFrameThreshTestStep = 30; + vpx_codec_pts_t last_drop = frame_count; + for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + } + } + + virtual void DropFramesMultiThreadsTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + virtual void MultiThreadsPSNRTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 4; + init_flags_ = VPX_CODEC_USE_PSNR; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, 30); + cfg_.rc_target_bitrate = 1000; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.5) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 2.0) + << " The datarate for the file missed the target!"; + } + + vpx_codec_pts_t last_pts_; + int64_t bits_in_buffer_model_; + double timebase_; + int frame_number_; + vpx_codec_pts_t first_drop_; + int64_t bits_total_; + double duration_; + double file_datarate_; + double effective_datarate_; + int64_t bits_in_last_frame_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int set_cpu_used_; + int gf_boost_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestLarge, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestLarge, DenoiserOffOn) { DenoiserOffOnTest(); } +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestLarge, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestLarge, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +class DatarateTestRealTime : public DatarateTestLarge { + public: + ~DatarateTestRealTime() override = default; +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestRealTime, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestRealTime, DenoiserOffOn) {} +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestRealTime, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +TEST_P(DatarateTestRealTime, MultiThreadsPSNR) { MultiThreadsPSNRTest(); } + +TEST_P(DatarateTestRealTime, RegionOfInterest) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +TEST_P(DatarateTestRealTime, GFBoost) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Apply a gf boost. + gf_boost_ = 50; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +TEST_P(DatarateTestRealTime, NV12) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + ::libvpx_test::YUVVideoSource video("hantro_collage_w352h288_nv12.yuv", + VPX_IMG_FMT_NV12, 352, 288, 30, 1, 0, + 100); + + cfg_.rc_target_bitrate = 200; + ResetModel(); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +class DatarateTestPsnr : public DatarateTestLarge { + public: + DatarateTestPsnr() : DatarateTestLarge() {} + ~DatarateTestPsnr() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(libvpx_test::kRealTime); + set_cpu_used_ = 10; + ResetModel(); + frame_flags_ = VPX_EFLAG_CALCULATE_PSNR; + } + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + DatarateTestLarge::PreEncodeFrameHook(video, encoder); + frame_flags_ ^= VPX_EFLAG_CALCULATE_PSNR; +#if CONFIG_INTERNAL_STATS + // CONFIG_INTERNAL_STATS unconditionally generates PSNR. + expect_psnr_ = true; +#else + expect_psnr_ = (frame_flags_ & VPX_EFLAG_CALCULATE_PSNR) != 0; +#endif // CONFIG_INTERNAL_STATS + if (video->img() == nullptr) { + expect_psnr_ = false; + } + } + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + libvpx_test::CxDataIterator iter = encoder->GetCxData(); + + bool had_psnr = false; + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind == VPX_CODEC_PSNR_PKT) had_psnr = true; + } + + EXPECT_EQ(had_psnr, expect_psnr_); + } + + private: + bool expect_psnr_; +}; + +TEST_P(DatarateTestPsnr, PerFramePsnr) { + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP8_INSTANTIATE_TEST_SUITE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_SUITE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); +VP8_INSTANTIATE_TEST_SUITE(DatarateTestPsnr, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(0)); + +} // namespace diff --git a/media/libvpx/libvpx/test/vp8_decrypt_test.cc b/media/libvpx/libvpx/test/vp8_decrypt_test.cc index bcac9d1a82..e00620b192 100644 --- a/media/libvpx/libvpx/test/vp8_decrypt_test.cc +++ b/media/libvpx/libvpx/test/vp8_decrypt_test.cc @@ -12,7 +12,7 @@ #include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/ivf_video_source.h" diff --git a/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc index 2cbcf04149..6c68355d8e 100644 --- a/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc +++ b/media/libvpx/libvpx/test/vp8_denoiser_sse2_test.cc @@ -12,7 +12,7 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -21,6 +21,7 @@ #include "vp8/encoder/denoising.h" #include "vp8/common/reconinter.h" #include "vpx/vpx_integer.h" +#include "vpx_config.h" #include "vpx_mem/vpx_mem.h" using libvpx_test::ACMRandom; @@ -30,17 +31,22 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam { public: - virtual ~VP8DenoiserTest() {} + ~VP8DenoiserTest() override = default; - virtual void SetUp() { increase_denoising_ = GetParam(); } + void SetUp() override { increase_denoising_ = GetParam(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int increase_denoising_; }; +// TODO(https://crbug.com/webm/1718): This test fails with gcc 8-10. +#if defined(__GNUC__) && __GNUC__ >= 8 +TEST_P(VP8DenoiserTest, DISABLED_BitexactCheck) { +#else TEST_P(VP8DenoiserTest, BitexactCheck) { +#endif ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 4000; const int stride = 16; @@ -87,7 +93,7 @@ TEST_P(VP8DenoiserTest, BitexactCheck) { // Check bitexactness. for (int h = 0; h < 16; ++h) { for (int w = 0; w < 16; ++w) { - EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); } } @@ -103,12 +109,12 @@ TEST_P(VP8DenoiserTest, BitexactCheck) { // Check bitexactness. for (int h = 0; h < 16; ++h) { for (int w = 0; w < 16; ++w) { - EXPECT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); + ASSERT_EQ(avg_block_c[h * stride + w], avg_block_sse2[h * stride + w]); } } } } // Test for all block size. -INSTANTIATE_TEST_CASE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1)); +INSTANTIATE_TEST_SUITE_P(SSE2, VP8DenoiserTest, ::testing::Values(0, 1)); } // namespace diff --git a/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc index da4f0caa1e..3a6514a3e9 100644 --- a/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc +++ b/media/libvpx/libvpx/test/vp8_fdct4x4_test.cc @@ -15,14 +15,18 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "./vpx_config.h" #include "./vp8_rtcd.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" namespace { +using FdctFunc = void (*)(int16_t *a, int16_t *b, int a_stride); + const int cospi8sqrt2minus1 = 20091; const int sinpi8sqrt2 = 35468; @@ -68,10 +72,21 @@ void reference_idct4x4(const int16_t *input, int16_t *output) { using libvpx_test::ACMRandom; -TEST(VP8FdctTest, SignBiasCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +class FdctTest : public ::testing::TestWithParam { + public: + void SetUp() override { + fdct_func_ = GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + FdctFunc fdct_func_; + ACMRandom rnd_; +}; + +TEST_P(FdctTest, SignBiasCheck) { int16_t test_input_block[16]; - int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_output_block[16]); const int pitch = 8; int count_sign_block[16][2]; const int count_test_block = 1000000; @@ -81,10 +96,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -110,10 +125,10 @@ TEST(VP8FdctTest, SignBiasCheck) { for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-15, 15]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); + test_input_block[j] = (rnd_.Rand8() >> 4) - (rnd_.Rand8() >> 4); } - vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch); + fdct_func_(test_input_block, test_output_block, pitch); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) { @@ -133,25 +148,24 @@ TEST(VP8FdctTest, SignBiasCheck) { EXPECT_EQ(true, bias_acceptable) << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]"; -}; +} -TEST(VP8FdctTest, RoundTripErrorCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); +TEST_P(FdctTest, RoundTripErrorCheck) { int max_error = 0; double total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { int16_t test_input_block[16]; - int16_t test_temp_block[16]; int16_t test_output_block[16]; + DECLARE_ALIGNED(16, int16_t, test_temp_block[16]); // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) { - test_input_block[j] = rnd.Rand8() - rnd.Rand8(); + test_input_block[j] = rnd_.Rand8() - rnd_.Rand8(); } const int pitch = 8; - vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + fdct_func_(test_input_block, test_temp_block, pitch); reference_idct4x4(test_temp_block, test_output_block); for (int j = 0; j < 16; ++j) { @@ -167,6 +181,31 @@ TEST(VP8FdctTest, RoundTripErrorCheck) { EXPECT_GE(count_test_block, total_error) << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; -}; +} +INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c)); + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, FdctTest, + ::testing::Values(vp8_short_fdct4x4_neon)); +#endif // HAVE_NEON + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, FdctTest, + ::testing::Values(vp8_short_fdct4x4_sse2)); +#endif // HAVE_SSE2 + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, FdctTest, + ::testing::Values(vp8_short_fdct4x4_msa)); +#endif // HAVE_MSA +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, FdctTest, + ::testing::Values(vp8_short_fdct4x4_mmi)); +#endif // HAVE_MMI + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, FdctTest, + ::testing::Values(vp8_short_fdct4x4_lsx)); +#endif // HAVE_LSX } // namespace diff --git a/media/libvpx/libvpx/test/vp8_fragments_test.cc b/media/libvpx/libvpx/test/vp8_fragments_test.cc index ac967d1b7e..1f13cde812 100644 --- a/media/libvpx/libvpx/test/vp8_fragments_test.cc +++ b/media/libvpx/libvpx/test/vp8_fragments_test.cc @@ -7,19 +7,19 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/video_source.h" namespace { -class VP8FramgmentsTest : public ::libvpx_test::EncoderTest, - public ::testing::Test { +class VP8FragmentsTest : public ::libvpx_test::EncoderTest, + public ::testing::Test { protected: - VP8FramgmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - virtual ~VP8FramgmentsTest() {} + VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} + ~VP8FragmentsTest() override = default; - virtual void SetUp() { + void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) VPX_CODEC_USE_OUTPUT_PARTITION; InitializeConfig(); @@ -28,7 +28,7 @@ class VP8FramgmentsTest : public ::libvpx_test::EncoderTest, } }; -TEST_F(VP8FramgmentsTest, TestFragmentsEncodeDecode) { +TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) { ::libvpx_test::RandomVideoSource video; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } diff --git a/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh index a8b7fe78ee..1e96f94cc7 100644 --- a/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh +++ b/media/libvpx/libvpx/test/vp8_multi_resolution_encoder.sh @@ -22,7 +22,7 @@ vp8_multi_resolution_encoder_verify_environment() { elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi - local readonly app="vp8_multi_resolution_encoder" + local app="vp8_multi_resolution_encoder" if [ -z "$(vpx_tool_path "${app}")" ]; then elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent." return 1 @@ -33,7 +33,7 @@ vp8_multi_resolution_encoder_verify_environment() { # Runs vp8_multi_resolution_encoder. Simply forwards all arguments to # vp8_multi_resolution_encoder after building path to the executable. vp8_mre() { - local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" + local encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" if [ ! -x "${encoder}" ]; then elog "${encoder} does not exist or is not executable." return 1 @@ -43,23 +43,35 @@ vp8_mre() { } vp8_multi_resolution_encoder_three_formats() { - local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf - ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf - ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local layer_bitrates="150 80 50" + local keyframe_insert="200" + local temporal_layers="3 3 3" + local framerate="30" if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then if [ "$(vp8_encode_available)" = "yes" ]; then # Param order: # Input width # Input height + # Framerate # Input file path # Output file names + # Layer bitrates + # Temporal layers + # Keyframe insert # Output PSNR vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" \ + "${framerate}" \ "${YUV_RAW_INPUT}" \ ${output_files} \ - 0 + ${layer_bitrates} \ + ${temporal_layers} \ + "${keyframe_insert}" \ + 0 || return 1 for output_file in ${output_files}; do if [ ! -e "${output_file}" ]; then diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..74a8cde7db --- /dev/null +++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vp8/vp8_ratectrl_rtc.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +struct Vp8RCTestVideo { + Vp8RCTestVideo() = default; + Vp8RCTestVideo(const char *name_, int width_, int height_, + unsigned int frames_) + : name(name_), width(width_), height(height_), frames(frames_) {} + + friend std::ostream &operator<<(std::ostream &os, + const Vp8RCTestVideo &video) { + os << video.name << " " << video.width << " " << video.height << " " + << video.frames; + return os; + } + const char *name; + int width; + int height; + unsigned int frames; +}; + +const Vp8RCTestVideo kVp8RCTestVectors[] = { + Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470), + Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300), + Vp8RCTestVideo("hantro_collage_w352h288.yuv", 352, 288, 100), +}; + +class Vp8RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + Vp8RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} + ~Vp8RcInterfaceTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + // From error_resilience_test.cc + int SetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L, update L. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, update G. + frame_flags = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARG. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (rc_cfg_.ts_number_layers > 1) { + const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); + const int frame_flags = + SetFrameFlags(video->frame(), cfg_.ts_number_layers); + frame_params_.temporal_layer_id = layer_id; + if (video->frame() > 0) { + encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id); + encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags); + } + } else { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, -6); + encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + if (rc_cfg_.is_screen) { + encoder->Control(VP8E_SET_SCREEN_CONTENT_MODE, 1); + } + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + encoder_exit_ = video->frame() == test_video_.frames; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int qp; + libvpx::UVDeltaQP uv_delta_qp; + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + uv_delta_qp = rc_api_->GetUVDeltaQP(); + // delta_qp for UV channel is only set for screen. + if (!rc_cfg_.is_screen) { + ASSERT_EQ(uv_delta_qp.uvdc_delta_q, 0); + ASSERT_EQ(uv_delta_qp.uvac_delta_q, 0); + } + } else { + num_drops_++; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + } + + void RunOneLayer() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerScreen() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfig(); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunPeriodicKey() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + key_interval_ = 100; + frame_drop_thresh_ = 30; + SetConfig(); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers2TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfigTemporalLayers(2); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers3TL() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + SetConfigTemporalLayers(3); + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + private: + void SetConfig() { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + } + + void SetConfigTemporalLayers(int temporal_layers) { + rc_cfg_.width = test_video_.width; + rc_cfg_.height = test_video_.height; + rc_cfg_.max_quantizer = 60; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = target_bitrate_; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = target_bitrate_; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + if (temporal_layers == 2) { + rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (temporal_layers == 3) { + rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100; + rc_cfg_.layer_target_bitrate[2] = target_bitrate_; + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } + + rc_cfg_.ts_number_layers = temporal_layers; + + // Encoder settings for ground truth. + cfg_.g_w = test_video_.width; + cfg_.g_h = test_video_.height; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = target_bitrate_; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 60; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.rc_target_bitrate = target_bitrate_; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + // 2 Temporal layers, no spatial layers, CBR mode. + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = temporal_layers; + if (temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.ts_periodicity = 2; + cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate; + } else if (temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.ts_periodicity = 4; + cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate; + } + } + + std::unique_ptr rc_api_; + libvpx::VP8RateControlRtcConfig rc_cfg_; + int key_interval_; + int target_bitrate_; + Vp8RCTestVideo test_video_; + libvpx::VP8FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; +}; + +TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + +TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } + +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + +VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, + ::testing::Values(200, 400, 1000), + ::testing::ValuesIn(kVp8RCTestVectors)); + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_arf_freq_test.cc b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc index 48a4ca7392..ea018b6e22 100644 --- a/media/libvpx/libvpx/test/vp9_arf_freq_test.cc +++ b/media/libvpx/libvpx/test/vp9_arf_freq_test.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -16,6 +18,7 @@ #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vpx_config.h" namespace { @@ -25,7 +28,7 @@ const int kBitrate = 500; #define ARF_NOT_SEEN 1000001 #define ARF_SEEN_ONCE 1000000 -typedef struct { +struct TestVideoParam { const char *filename; unsigned int width; unsigned int height; @@ -35,12 +38,12 @@ typedef struct { vpx_img_fmt fmt; vpx_bit_depth_t bit_depth; unsigned int profile; -} TestVideoParam; +}; -typedef struct { +struct TestEncodeParam { libvpx_test::TestMode mode; int cpu_used; -} TestEncodeParam; +}; const TestVideoParam kTestVectors[] = { // artificially increase framerate to trigger default check @@ -84,9 +87,9 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - virtual ~ArfFreqTest() {} + ~ArfFreqTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(test_encode_param_.mode); if (test_encode_param_.mode != ::libvpx_test::kRealTime) { @@ -102,7 +105,7 @@ class ArfFreqTest dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { min_run_ = ARF_NOT_SEEN; run_of_visible_frames_ = 0; } @@ -124,7 +127,7 @@ class ArfFreqTest return frames; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; const int frames = GetNumFramesInPkt(pkt); if (frames == 1) { @@ -143,8 +146,8 @@ class ArfFreqTest } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); @@ -190,7 +193,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) { init_flags_ = VPX_CODEC_USE_PSNR; if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; - testing::internal::scoped_ptr video; + std::unique_ptr video; if (is_extension_y4m(test_video_param_.filename)) { video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, kFrames)); @@ -211,7 +214,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) { } } -VP9_INSTANTIATE_TEST_CASE(ArfFreqTest, ::testing::ValuesIn(kTestVectors), - ::testing::ValuesIn(kEncodeVectors), - ::testing::ValuesIn(kMinArfVectors)); +VP9_INSTANTIATE_TEST_SUITE(ArfFreqTest, ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kEncodeVectors), + ::testing::ValuesIn(kMinArfVectors)); } // namespace diff --git a/media/libvpx/libvpx/test/vp9_error_block_test.cc b/media/libvpx/libvpx/test/vp9_block_error_test.cc similarity index 58% rename from media/libvpx/libvpx/test/vp9_error_block_test.cc rename to media/libvpx/libvpx/test/vp9_block_error_test.cc index 74436c09e7..5cf8280c68 100644 --- a/media/libvpx/libvpx/test/vp9_error_block_test.cc +++ b/media/libvpx/libvpx/test/vp9_block_error_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vp9_rtcd.h" @@ -23,51 +24,52 @@ #include "vp9/common/vp9_entropy.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" using libvpx_test::ACMRandom; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int kNumIterations = 1000; -typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, int bps); +using HBDBlockErrorFunc = int64_t (*)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps); -typedef std::tr1::tuple - ErrorBlockParam; +using BlockErrorParam = + std::tuple; -// wrapper for 8-bit block error functions without a 'bps' param. -typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz); -template -int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { - EXPECT_EQ(8, bps); +using BlockErrorFunc = int64_t (*)(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz); + +template +int64_t BlockError8BitWrapper(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bps) { + EXPECT_EQ(bps, 8); return fn(coeff, dqcoeff, block_size, ssz); } -class ErrorBlockTest : public ::testing::TestWithParam { +class BlockErrorTest : public ::testing::TestWithParam { public: - virtual ~ErrorBlockTest() {} - virtual void SetUp() { + ~BlockErrorTest() override = default; + void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: vpx_bit_depth_t bit_depth_; - ErrorBlockFunc error_block_op_; - ErrorBlockFunc ref_error_block_op_; + HBDBlockErrorFunc error_block_op_; + HBDBlockErrorFunc ref_error_block_op_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(BlockErrorTest); -TEST_P(ErrorBlockTest, OperationCheck) { +TEST_P(BlockErrorTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -110,7 +112,7 @@ TEST_P(ErrorBlockTest, OperationCheck) { << "First failed at test case " << first_failure; } -TEST_P(ErrorBlockTest, ExtremeValues) { +TEST_P(BlockErrorTest, ExtremeValues) { ACMRandom rnd(ACMRandom::DeterministicSeed()); DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); @@ -168,32 +170,58 @@ TEST_P(ErrorBlockTest, ExtremeValues) { << "First failed at test case " << first_failure; } -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, ErrorBlockTest, - ::testing::Values( - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_10), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_12), - make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, - VPX_BITS_8), - make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); +const BlockErrorParam sse2_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, BlockErrorTest, + ::testing::ValuesIn(sse2_block_error_tests)); #endif // HAVE_SSE2 -#if HAVE_AVX -INSTANTIATE_TEST_CASE_P( - AVX, ErrorBlockTest, - ::testing::Values(make_tuple( - &HighBdBlockError8bitWrapper, - &HighBdBlockError8bitWrapper, - VPX_BITS_8))); -#endif // HAVE_AVX +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlockErrorTest, + ::testing::Values(make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, + VPX_BITS_8))); +#endif // HAVE_AVX2 +#if HAVE_NEON +const BlockErrorParam neon_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_8), #endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, + ::testing::ValuesIn(neon_block_error_tests)); +#endif // HAVE_NEON + +#if HAVE_SVE +const BlockErrorParam sve_block_error_tests[] = { make_tuple( + &BlockError8BitWrapper, + &BlockError8BitWrapper, VPX_BITS_8) }; + +INSTANTIATE_TEST_SUITE_P(SVE, BlockErrorTest, + ::testing::ValuesIn(sve_block_error_tests)); +#endif // HAVE_SVE } // namespace diff --git a/media/libvpx/libvpx/test/vp9_boolcoder_test.cc b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc index 5dbfd5ca59..fbbdb19465 100644 --- a/media/libvpx/libvpx/test/vp9_boolcoder_test.cc +++ b/media/libvpx/libvpx/test/vp9_boolcoder_test.cc @@ -12,7 +12,7 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "vpx/vpx_integer.h" @@ -53,7 +53,7 @@ TEST(VP9, TestBitIO) { ACMRandom bit_rnd(random_seed); vpx_writer bw; uint8_t bw_buffer[kBufferSize]; - vpx_start_encode(&bw, bw_buffer); + vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer)); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0; for (int i = 0; i < kBitsToTest; ++i) { @@ -65,13 +65,16 @@ TEST(VP9, TestBitIO) { vpx_write(&bw, bit, static_cast(probas[i])); } - vpx_stop_encode(&bw); + GTEST_ASSERT_EQ(vpx_stop_encode(&bw), 0); + // vpx_reader_fill() may read into uninitialized data that + // isn't used meaningfully, but may trigger an MSan warning. + memset(bw_buffer + bw.pos, 0, sizeof(BD_VALUE) - 1); // First bit should be zero GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0); vpx_reader br; - vpx_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL); + vpx_reader_init(&br, bw_buffer, kBufferSize, nullptr, nullptr); bit_rnd.Reset(random_seed); for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { @@ -87,3 +90,24 @@ TEST(VP9, TestBitIO) { } } } + +TEST(VP9, TestBitIOBufferSize0) { + vpx_writer bw; + uint8_t bw_buffer[1]; + vpx_start_encode(&bw, bw_buffer, 0); + GTEST_ASSERT_EQ(vpx_stop_encode(&bw), -1); +} + +TEST(VP9, TestBitIOBufferSize1) { + vpx_writer bw; + uint8_t bw_buffer[1]; + vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer)); + GTEST_ASSERT_EQ(vpx_stop_encode(&bw), -1); +} + +TEST(VP9, TestBitIOBufferSize2) { + vpx_writer bw; + uint8_t bw_buffer[2]; + vpx_start_encode(&bw, bw_buffer, sizeof(bw_buffer)); + GTEST_ASSERT_EQ(vpx_stop_encode(&bw), 0); +} diff --git a/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh new file mode 100755 index 0000000000..03843610dc --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,420 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) + +# Clips used in test. +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. + return 1 + fi + # This instruction_set is supported. + return 0 +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. +vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. +vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" + cd "${save_dir}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + cd "${save_dir}" + return 1 + fi + + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch + cd "${save_dir}" + return 1 + fi + fi + done + done + done + done + cd "${save_dir}" +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimization enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + if ! uname -m | grep -q "x86"; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then + echo "${isa} is not supported in this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch + return 1 + fi +} + +vp9_c_vs_simd_enc_test() { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + if ! vp9_test_x86 "x86_64"; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + # Test ARM + echo "vp9_test_arm: Started." + if ! vp9_test_arm; then + echo "vp9 test for arm: Done, test failed." + return 1 + else + echo "vp9 test for arm: Done, all tests passed." + fi +} + +# Setup a trap function to clean up build, and output files after tests complete. +# trap cleanup EXIT + +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test diff --git a/media/libvpx/libvpx/test/vp9_datarate_test.cc b/media/libvpx/libvpx/test/vp9_datarate_test.cc new file mode 100644 index 0000000000..2c90d751e7 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_datarate_test.cc @@ -0,0 +1,1151 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/acm_random.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +class DatarateTestVP9 : public ::libvpx_test::EncoderTest { + public: + explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec) { + tune_content_ = 0; + } + + protected: + ~DatarateTestVP9() override = default; + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + tot_frame_number_ = 0; + first_drop_ = 0; + num_drops_ = 0; + aq_mode_ = 3; + // Denoiser is off by default. + denoiser_on_ = 0; + // For testing up to 3 layers. + for (int i = 0; i < 3; ++i) { + bits_total_[i] = 0; + } + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; + delta_q_uv_ = 0; + use_roi_ = false; + } + + // + // Frame flags and layer id for temporal layers. + // + + // For two layers, test pattern is: + // 1 3 + // 0 2 ..... + // For three layers, test pattern is: + // 1 3 5 7 + // 2 6 + // 0 4 .... + // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. + // For this 3 layer example, the 2nd enhancement layer (layer 2) updates + // the altref frame. + static int GetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L and ARF; update L. + frame_flags = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, ARF; update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARF. + frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + static int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); + + if (use_roi_) { + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + encoder->Control(VP9E_SET_AQ_MODE, 0); + } + + if (delta_q_uv_ != 0) { + encoder->Control(VP9E_SET_DELTA_Q_UV, delta_q_uv_); + } + + if (cfg_.ts_number_layers > 1) { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_SVC, 1); + } + if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + vpx_svc_layer_id_t layer_id; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } + } + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + if (duration > 1) { + // If first drop not set and we have a drop set it to this time. + if (!first_drop_) first_drop_ = last_pts_ + 1; + // Update the number of frame drops. + num_drops_ += static_cast(duration - 1); + // Update counter for total number of frames (#frames input to encoder). + // Needed for setting the proper layer_id below. + tot_frame_number_ += static_cast(duration - 1); + } + + int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + // Buffer should not go negative. + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + + const size_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Update the total encoded bits. For temporal layers, update the cumulative + // encoded bits per layer. + for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { + bits_total_[i] += frame_size_in_bits; + } + + // Update the most recent pts. + last_pts_ = pkt->data.frame.pts; + ++frame_number_; + ++tot_frame_number_; + } + + void EndPassHook() override { + for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); + ++layer) { + duration_ = (last_pts_ + 1) * timebase_; + if (bits_total_[layer]) { + // Effective file datarate: + effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; + } + } + } + + vpx_codec_pts_t last_pts_; + double timebase_; + int tune_content_; + int frame_number_; // Counter for number of non-dropped/encoded frames. + int tot_frame_number_; // Counter for total number of input frames. + int64_t bits_total_[3]; + double duration_; + double effective_datarate_[3]; + int set_cpu_used_; + int64_t bits_in_buffer_model_; + vpx_codec_pts_t first_drop_; + int num_drops_; + int aq_mode_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int frame_parallel_decoding_mode_; + int delta_q_uv_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9RealTimeMultiBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Params: speed setting and index for bitrate array. +class DatarateTestVP9LargeVBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for VBR mode with 0 lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off( and error_resilience off). +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting444) { + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + const int bitrates[4] = { 250, 450, 650, 850 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 0.80) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 1.15) + << " The datarate for the file missed the target!" + << cfg_.rc_target_bitrate << " " << effective_datarate_; +} + +// Check that (1) the first dropped frame gets earlier and earlier +// as the drop frame threshold is increased, and (2) that the total number of +// frame drops does not decrease as we increase frame drop threshold. +// Use a lower qp-max to force some frame drops. +TEST_P(DatarateTestVP9RealTimeMultiBR, ChangingDropFrameThresh) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + // TODO(marpan): Investigate datarate target failures with a smaller keyframe + // interval (128). + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + const int kDropFrameThreshTestStep = 30; + const int bitrates[2] = { 50, 150 }; + const int bitrate_index = GET_PARAM(2); + if (bitrate_index > 1) return; + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + vpx_codec_pts_t last_drop = 140; + int last_num_drops = 0; + for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + ASSERT_GE(num_drops_, last_num_drops * 0.85) + << " The number of dropped frames for drop_thresh " << i + << " < number of dropped frames for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + last_num_drops = num_drops_; + } +} // namespace + +// Check basic rate targeting for 2 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting2TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 60-40 bitrate allocation for 2 temporal layers. + cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Check basic rate targeting for 3 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than .75. + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than 1.25. + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Params: speed setting. +class DatarateTestVP9RealTime : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} + ~DatarateTestVP9RealTime() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. +TEST_P(DatarateTestVP9RealTime, BasicRateTargetingDropFramesMultiThreads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for 3 temporal layers, with frame dropping. +// Only for one (low) bitrate with lower max_quantizer, and somewhat higher +// frame drop threshold, to force frame dropping. +TEST_P(DatarateTestVP9RealTime, + BasicRateTargeting3TemporalLayersFrameDropping) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + // Set frame drop threshold and rc_max_quantizer to force some frame drops. + cfg_.rc_dropframe_thresh = 20; + cfg_.rc_max_quantizer = 45; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.20) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + // Expect some frame drops in this test: for this 200 frames test, + // expect at least 10% and not more than 60% drops. + ASSERT_GE(num_drops_, 20); + ASSERT_LE(num_drops_, 280); + } +} + +// Check VP9 region of interest feature. +TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { + if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 640; + cfg_.g_h = 480; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 7) / 8; + roi_.cols = (cfg_.g_w + 7) / 8; + + roi_.delta_q[1] = -20; + roi_.delta_lf[1] = -20; + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = reinterpret_cast( + calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); + ASSERT_NE(roi_.roi_map, nullptr); + + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +// Params: speed setting, delta q UV. +class DatarateTestVP9RealTimeDeltaQUV + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +TEST_P(DatarateTestVP9RealTimeDeltaQUV, DeltaQUV) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 640; + cfg_.g_h = 480; + + ResetModel(); + + delta_q_uv_ = GET_PARAM(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; +} + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9PostEncodeDrop + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. +TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + cfg_.g_error_resilient = 0; + tune_content_ = 1; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +using libvpx_test::ACMRandom; + +class DatarateTestVP9FrameQp + : public DatarateTestVP9, + public ::testing::TestWithParam { + public: + DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} + ~DatarateTestVP9FrameQp() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + ResetModel(); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + set_cpu_used_ = 7; + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + frame_qp_ = static_cast(rnd_.RandRange(64)); + encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_); + frame_++; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + int qp = 0; + vpx_svc_layer_id_t layer_id; + if (frame_ >= total_frame_) return; + encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp); + ASSERT_EQ(frame_qp_, qp); + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + } + + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { + if (frame_ >= total_frame_) return; + ASSERT_TRUE(cfg_.temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_0212 && + temporal_layer_id_ == 2); + } + + protected: + int total_frame_; + + private: + ACMRandom rnd_; + int frame_qp_; + int frame_; + int temporal_layer_id_; +}; + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + cfg_.rc_target_bitrate = 200; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212; + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 1; + total_frame_ = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, total_frame_); + ResetModel(); + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting. +class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { + public: + ~DatarateTestVP9RealTimeDenoiser() override = default; +}; + +// Check basic datarate targeting, for a single bitrate, when denoiser is on. +TEST_P(DatarateTestVP9RealTimeDenoiser, LowNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for clip with high noise level. Use 2 threads. +TEST_P(DatarateTestVP9RealTimeDenoiser, HighNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; + + ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. +TEST_P(DatarateTestVP9RealTimeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} +#endif // CONFIG_VP9_TEMPORAL_DENOISING + +class DatarateTestVP9Psnr : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + protected: + DatarateTestVP9Psnr() : DatarateTestVP9(GET_PARAM(0)) {} + ~DatarateTestVP9Psnr() override = default; + + void SetUp() override { + InitializeConfig(); + cfg_.g_lag_in_frames = 0; + SetMode(libvpx_test::kRealTime); + set_cpu_used_ = 10; + ResetModel(); + frame_flags_ = VPX_EFLAG_CALCULATE_PSNR; + expect_psnr_ = true; + } + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + DatarateTestVP9::PreEncodeFrameHook(video, encoder); + frame_flags_ ^= VPX_EFLAG_CALCULATE_PSNR; +#if CONFIG_INTERNAL_STATS + // CONFIG_INTERNAL_STATS unconditionally generates PSNR. + expect_psnr_ = true; +#else + expect_psnr_ = (frame_flags_ & VPX_EFLAG_CALCULATE_PSNR) != 0; +#endif // CONFIG_INTERNAL_STATS + if (video->img() == nullptr) { + expect_psnr_ = false; + } + } + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + libvpx_test::CxDataIterator iter = encoder->GetCxData(); + + bool had_psnr = false; + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind == VPX_CODEC_PSNR_PKT) had_psnr = true; + } + + EXPECT_EQ(had_psnr, expect_psnr_); + } + + private: + bool expect_psnr_; +}; + +TEST_P(DatarateTestVP9Psnr, PerFramePsnr) { + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 100); + + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 4)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9), + ::testing::Range(0, 2)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10)); + +#if CONFIG_VP9 +INSTANTIATE_TEST_SUITE_P( + VP9, DatarateTestVP9FrameQp, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV, + ::testing::Range(5, 10), + ::testing::Values(-5, -10, -15)); + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9PostEncodeDrop, + ::testing::Range(5, 6)); + +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDenoiser, + ::testing::Range(5, 10)); +#endif + +VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9Psnr, + ::testing::Values(::libvpx_test::kRealTime)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_decrypt_test.cc b/media/libvpx/libvpx/test/vp9_decrypt_test.cc index 1874d23117..558ee70366 100644 --- a/media/libvpx/libvpx/test/vp9_decrypt_test.cc +++ b/media/libvpx/libvpx/test/vp9_decrypt_test.cc @@ -12,7 +12,7 @@ #include #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/ivf_video_source.h" diff --git a/media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc b/media/libvpx/libvpx/test/vp9_denoiser_test.cc similarity index 50% rename from media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc rename to media/libvpx/libvpx/test/vp9_denoiser_test.cc index 2a50b77355..bc5612ea47 100644 --- a/media/libvpx/libvpx/test/vp9_denoiser_sse2_test.cc +++ b/media/libvpx/libvpx/test/vp9_denoiser_test.cc @@ -11,8 +11,9 @@ #include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/acm_random.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -23,23 +24,35 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_denoiser.h" +#include "vpx_config.h" using libvpx_test::ACMRandom; namespace { const int kNumPixels = 64 * 64; -class VP9DenoiserTest : public ::testing::TestWithParam { + +using Vp9DenoiserFilterFunc = int (*)(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude); +using VP9DenoiserTestParam = std::tuple; + +class VP9DenoiserTest + : public ::testing::Test, + public ::testing::WithParamInterface { public: - virtual ~VP9DenoiserTest() {} + ~VP9DenoiserTest() override = default; - virtual void SetUp() { bs_ = GetParam(); } + void SetUp() override { bs_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: BLOCK_SIZE bs_; }; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9DenoiserTest); TEST_P(VP9DenoiserTest, BitexactCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -76,9 +89,9 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { 64, avg_block_c, 64, 0, bs_, motion_magnitude_random)); - ASM_REGISTER_STATE_CHECK(vp9_denoiser_filter_sse2( - sig_block, 64, mc_avg_block, 64, avg_block_sse2, 64, 0, bs_, - motion_magnitude_random)); + ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 64, mc_avg_block, 64, + avg_block_sse2, 64, 0, bs_, + motion_magnitude_random)); // Test bitexactness. for (int h = 0; h < (4 << b_height_log2_lookup[bs_]); ++h) { @@ -89,10 +102,36 @@ TEST_P(VP9DenoiserTest, BitexactCheck) { } } +using std::make_tuple; + // Test for all block size. -INSTANTIATE_TEST_CASE_P(SSE2, VP9DenoiserTest, - ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, - BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, - BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, - BLOCK_64X64)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_sse2, BLOCK_64X64))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, VP9DenoiserTest, + ::testing::Values(make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_8X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X8), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_16X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X16), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_32X64), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X32), + make_tuple(&vp9_denoiser_filter_neon, BLOCK_64X64))); +#endif } // namespace diff --git a/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc index 53dc8c9fe4..00860789c1 100644 --- a/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc +++ b/media/libvpx/libvpx/test/vp9_encoder_parms_get_to_decoder.cc @@ -8,7 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include + +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -60,9 +62,9 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - virtual ~VpxEncoderParmsGetToDecoder() {} + ~VpxEncoderParmsGetToDecoder() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kTwoPassGood); cfg_.g_lag_in_frames = 25; @@ -72,9 +74,9 @@ class VpxEncoderParmsGetToDecoder cfg_.rc_target_bitrate = test_video_.bitrate; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless); @@ -93,15 +95,13 @@ class VpxEncoderParmsGetToDecoder } } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); vpx_codec_alg_priv_t *const priv = reinterpret_cast(vp9_decoder->priv); - FrameWorkerData *const worker_data = - reinterpret_cast(priv->frame_workers[0].data1); - VP9_COMMON *const common = &worker_data->pbi->common; + VP9_COMMON *const common = &priv->pbi->common; if (encode_parms.lossless) { EXPECT_EQ(0, common->base_qindex); @@ -140,14 +140,14 @@ class VpxEncoderParmsGetToDecoder TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) { init_flags_ = VPX_CODEC_USE_PSNR; - testing::internal::scoped_ptr video( + std::unique_ptr video( new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames)); - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); } -VP9_INSTANTIATE_TEST_CASE(VpxEncoderParmsGetToDecoder, - ::testing::ValuesIn(kVP9EncodeParameterSet), - ::testing::ValuesIn(kVP9EncodePerfTestVectors)); +VP9_INSTANTIATE_TEST_SUITE(VpxEncoderParmsGetToDecoder, + ::testing::ValuesIn(kVP9EncodeParameterSet), + ::testing::ValuesIn(kVP9EncodePerfTestVectors)); } // namespace diff --git a/media/libvpx/libvpx/test/vp9_end_to_end_test.cc b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc index 955f567ce2..a1ad5c1e4e 100644 --- a/media/libvpx/libvpx/test/vp9_end_to_end_test.cc +++ b/media/libvpx/libvpx/test/vp9_end_to_end_test.cc @@ -8,36 +8,40 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "memory" + +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" +#include "test/i420_video_source.h" #include "test/util.h" #include "test/y4m_video_source.h" #include "test/yuv_video_source.h" +#include "vpx_config.h" namespace { const unsigned int kWidth = 160; const unsigned int kHeight = 90; const unsigned int kFramerate = 50; -const unsigned int kFrames = 10; +const unsigned int kFrames = 20; const int kBitrate = 500; // List of psnr thresholds for speed settings 0-7 and 5 encoding modes const double kPsnrThreshold[][5] = { { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 }, { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 }, - { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 }, - { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 }, + { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 }, + { 28.4, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 }, }; -typedef struct { +struct TestVideoParam { const char *filename; unsigned int input_bit_depth; vpx_img_fmt fmt; vpx_bit_depth_t bit_depth; unsigned int profile; -} TestVideoParam; +}; const TestVideoParam kTestVectors[] = { { "park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0 }, @@ -45,25 +49,35 @@ const TestVideoParam kTestVectors[] = { { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 }, { "park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1 }, #if CONFIG_VP9_HIGHBITDEPTH - { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 }, - { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 }, - { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 }, + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 }, { "park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3 }, - { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 }, - { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 }, - { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 }, { "park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3 }, #endif // CONFIG_VP9_HIGHBITDEPTH }; +const TestVideoParam kTestVectorsNv12[] = { + { "hantro_collage_w352h288_nv12.yuv", 8, VPX_IMG_FMT_NV12, VPX_BITS_8, 0 }, +}; + +const TestVideoParam k4x2VideoTestVectors[] = { + { "4x2.y4m", 8, VPX_IMG_FMT_I420, VPX_BITS_8, 0 }, +}; + // Encoding modes tested const libvpx_test::TestMode kEncodingModeVectors[] = { +#if !CONFIG_REALTIME_ONLY ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime, +#endif + ::libvpx_test::kRealTime }; // Speed settings tested -const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 }; +const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6, 7 }; int is_extension_y4m(const char *filename) { const char *dot = strrchr(filename, '.'); @@ -74,6 +88,43 @@ int is_extension_y4m(const char *filename) { } } +class EndToEndTestAdaptiveRDThresh + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + EndToEndTestAdaptiveRDThresh() + : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), + cpu_used_end_(GET_PARAM(2)) {} + + ~EndToEndTestAdaptiveRDThresh() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + dec_cfg_.threads = 4; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP9E_SET_TILE_COLUMNS, 2); + } + if (video->frame() == 100) + encoder->Control(VP8E_SET_CPUUSED, cpu_used_end_); + } + + private: + int cpu_used_start_; + int cpu_used_end_; +}; + class EndToEndTestLarge : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWith3Paramsdata.psnr.psnr[0]; nframes_++; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); encoder->Control(VP8E_SET_CPUUSED, cpu_used_); @@ -123,6 +177,9 @@ class EndToEndTestLarge encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } else { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_AQ_MODE, cyclic_refresh_); } } } @@ -138,6 +195,8 @@ class EndToEndTestLarge TestVideoParam test_video_param_; int cpu_used_; + int cyclic_refresh_; + int denoiser_on_; private: double psnr_; @@ -145,6 +204,89 @@ class EndToEndTestLarge libvpx_test::TestMode encoding_mode_; }; +#if CONFIG_VP9_DECODER +// The test parameters control VP9D_SET_LOOP_FILTER_OPT and the number of +// decoder threads. +class EndToEndTestLoopFilterThreading + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + EndToEndTestLoopFilterThreading() + : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} + + ~EndToEndTestLoopFilterThreading() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_threads = 2; + cfg_.g_lag_in_frames = 0; + cfg_.rc_target_bitrate = 500; + cfg_.rc_end_usage = VPX_CBR; + cfg_.kf_min_dist = 1; + cfg_.kf_max_dist = 1; + dec_cfg_.threads = GET_PARAM(2); + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 8); + } + encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); + } + + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { + if (video->frame() == 0) { + decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0); + } + } + + private: + const bool use_loop_filter_opt_; +}; +#endif // CONFIG_VP9_DECODER + +class EndToEndNV12 : public EndToEndTestLarge {}; + +TEST_P(EndToEndNV12, EndtoEndNV12Test) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + + video.reset(new libvpx_test::YUVVideoSource(test_video_param_.filename, + test_video_param_.fmt, 352, 288, + 30, 1, 0, 100)); + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +class EndToEnd4x2Video : public EndToEndTestLarge {}; + +TEST_P(EndToEnd4x2Video, EndtoEnd4x2VideoTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + + std::unique_ptr video; + + video.reset( + new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, 200)); + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { cfg_.rc_target_bitrate = kBitrate; cfg_.g_error_resilient = 0; @@ -154,7 +296,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { init_flags_ = VPX_CODEC_USE_PSNR; if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; - testing::internal::scoped_ptr video; + std::unique_ptr video; if (is_extension_y4m(test_video_param_.filename)) { video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, kFrames)); @@ -163,15 +305,80 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, kFramerate, 1, 0, kFrames)); } - ASSERT_TRUE(video.get() != NULL); + ASSERT_NE(video.get(), nullptr); ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); const double psnr = GetAveragePsnr(); EXPECT_GT(psnr, GetPsnrThreshold()); } -VP9_INSTANTIATE_TEST_CASE(EndToEndTestLarge, - ::testing::ValuesIn(kEncodingModeVectors), - ::testing::ValuesIn(kTestVectors), - ::testing::ValuesIn(kCpuUsedVectors)); +TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = VPX_CODEC_USE_PSNR; + cyclic_refresh_ = 3; + denoiser_on_ = 1; + if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libvpx_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight, + kFramerate, 1, 0, kFrames)); + } + ASSERT_NE(video.get(), nullptr); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()); +} + +TEST_P(EndToEndTestAdaptiveRDThresh, EndtoEndAdaptiveRDThreshRowMT) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_threads = 2; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9_DECODER +TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) { + ::libvpx_test::RandomVideoSource video; + video.SetSize(4096, 2160); + video.set_limit(10); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#endif // CONFIG_VP9_DECODER + +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); + +VP9_INSTANTIATE_TEST_SUITE(EndToEndNV12, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::ValuesIn(kTestVectorsNv12), + ::testing::Values(6, 7, 8)); + +VP9_INSTANTIATE_TEST_SUITE(EndToEnd4x2Video, + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::ValuesIn(k4x2VideoTestVectors), + ::testing::Values(0, 1)); + +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestAdaptiveRDThresh, + ::testing::Values(5, 6, 7), ::testing::Values(8, 9)); + +#if CONFIG_VP9_DECODER +VP9_INSTANTIATE_TEST_SUITE(EndToEndTestLoopFilterThreading, ::testing::Bool(), + ::testing::Range(2, 6)); +#endif // CONFIG_VP9_DECODER } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ethread_test.cc b/media/libvpx/libvpx/test/vp9_ethread_test.cc index 4df40854b8..9e366eb366 100644 --- a/media/libvpx/libvpx/test/vp9_ethread_test.cc +++ b/media/libvpx/libvpx/test/vp9_ethread_test.cc @@ -10,14 +10,221 @@ #include #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/md5_helper.h" #include "test/util.h" #include "test/y4m_video_source.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vpx_config.h" namespace { +// FIRSTPASS_STATS struct: +// { +// 26 double members; +// 1 int64_t member; +// } +// Whenever FIRSTPASS_STATS struct is modified, the following constants need to +// be revisited. +const int kDbl = 26; +const int kInt = 1; +const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); + +class VPxFirstPassEncoderThreadTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + protected: + VPxFirstPassEncoderThreadTest() + : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(0), + encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) { + init_flags_ = VPX_CODEC_USE_PSNR; + + row_mt_mode_ = 1; + first_pass_only_ = true; + firstpass_stats_.buf = nullptr; + firstpass_stats_.sz = 0; + } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.rc_end_usage = VPX_VBR; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_maxsection_pct = 2000; + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + void BeginPassHook(unsigned int /*pass*/) override { + encoder_initialized_ = false; + abort_ = false; + } + + void EndPassHook() override { + // For first pass stats test, only run first pass encoder. + if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) + abort_ |= first_pass_only_; + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { + if (!encoder_initialized_) { + // Encode in 2-pass mode. + encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); + + if (encoding_mode_ == ::libvpx_test::kTwoPassGood) + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + + encoder_initialized_ = true; + } + } + + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { + const uint8_t *const pkt_buf = + reinterpret_cast(pkt->data.twopass_stats.buf); + const size_t pkt_size = pkt->data.twopass_stats.sz; + + // First pass stats size equals sizeof(FIRSTPASS_STATS) + EXPECT_EQ(pkt_size, kFirstPassStatsSz) + << "Error: First pass stats size doesn't equal kFirstPassStatsSz"; + + firstpass_stats_.buf = + realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size); + ASSERT_NE(firstpass_stats_.buf, nullptr); + memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf, + pkt_size); + firstpass_stats_.sz += pkt_size; + } + + bool encoder_initialized_; + int tiles_; + ::libvpx_test::TestMode encoding_mode_; + int set_cpu_used_; + int row_mt_mode_; + bool first_pass_only_; + vpx_fixed_buf_t firstpass_stats_; +}; + +#if !CONFIG_REALTIME_ONLY +static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match or at least are very close. + FIRSTPASS_STATS *stats1 = reinterpret_cast(fp_stats->buf); + int nframes_ = (int)(fp_stats->sz / sizeof(FIRSTPASS_STATS)); + FIRSTPASS_STATS *stats2 = stats1 + nframes_ / 2; + int i, j; + + // The total stats are also output and included in the first pass stats. Here + // ignore that in the comparison. + for (i = 0; i < (nframes_ / 2 - 1); ++i) { + const double *frame_stats1 = reinterpret_cast(stats1); + const double *frame_stats2 = reinterpret_cast(stats2); + + for (j = 0; j < kDbl; ++j) { + ASSERT_LE(fabs(*frame_stats1 - *frame_stats2), + fabs(*frame_stats1) / factor) + << "First failure @ frame #" << i << " stat #" << j << " (" + << *frame_stats1 << " vs. " << *frame_stats2 << ")"; + frame_stats1++; + frame_stats2++; + } + + stats1++; + stats2++; + } + + // Reset firstpass_stats_ to 0. + memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz); + fp_stats->sz = 0; +} + +static void compare_fp_stats_md5(vpx_fixed_buf_t *fp_stats) { + // fp_stats consists of 2 set of first pass encoding stats. These 2 set of + // stats are compared to check if the stats match. + uint8_t *stats1 = reinterpret_cast(fp_stats->buf); + uint8_t *stats2 = stats1 + fp_stats->sz / 2; + ::libvpx_test::MD5 md5_row_mt_0, md5_row_mt_1; + + md5_row_mt_0.Add(stats1, fp_stats->sz / 2); + const char *md5_row_mt_0_str = md5_row_mt_0.Get(); + + md5_row_mt_1.Add(stats2, fp_stats->sz / 2); + const char *md5_row_mt_1_str = md5_row_mt_1.Get(); + + // Check md5 match. + ASSERT_STREQ(md5_row_mt_0_str, md5_row_mt_1_str) + << "MD5 checksums don't match"; + + // Reset firstpass_stats_ to 0. + memset((uint8_t *)fp_stats->buf, 0, fp_stats->sz); + fp_stats->sz = 0; +} +#endif // !CONFIG_REALTIME_ONLY + +TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { +#if CONFIG_REALTIME_ONLY + GTEST_SKIP(); +#else + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + first_pass_only_ = true; + cfg_.rc_target_bitrate = 1000; + + // Test row_mt_mode: 0 vs 1 at single thread case(threads = 1, tiles_ = 0) + tiles_ = 0; + cfg_.g_threads = 1; + + row_mt_mode_ = 0; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + row_mt_mode_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if using or not using row-mt generates close stats. + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); + + // Test single thread vs multiple threads + row_mt_mode_ = 1; + tiles_ = 0; + + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + cfg_.g_threads = 4; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if single-thread and multi-thread stats are close enough. + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); + + // Bit exact test in row_mt mode. + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + tiles_ = 2; + + cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + cfg_.g_threads = 8; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if stats match with row-mt=0/1. + compare_fp_stats_md5(&firstpass_stats_); +#endif // CONFIG_REALTIME_ONLY +} + class VPxEncoderThreadTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWith4ParamsControl(VP9E_SET_TILE_COLUMNS, tiles_); @@ -70,20 +281,27 @@ class VPxEncoderThreadTest encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0); encoder->Control(VP9E_SET_AQ_MODE, 3); } + encoder->Control(VP9E_SET_ROW_MT, row_mt_mode_); + encoder_initialized_ = true; } } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t /*pts*/) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { ::libvpx_test::MD5 md5_res; md5_res.Add(&img); md5_.push_back(md5_res.Get()); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder * /*decoder*/) { + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res); return false; @@ -92,63 +310,142 @@ class VPxEncoderThreadTest return true; } + double GetAveragePsnr() const { return nframes_ ? (psnr_ / nframes_) : 0.0; } + bool encoder_initialized_; int tiles_; int threads_; ::libvpx_test::TestMode encoding_mode_; int set_cpu_used_; + int row_mt_mode_; + double psnr_; + unsigned int nframes_; std::vector md5_; }; TEST_P(VPxEncoderThreadTest, EncoderResultTest) { - std::vector single_thr_md5, multi_thr_md5; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20); - cfg_.rc_target_bitrate = 1000; + // Part 1: Bit exact test for row_mt_mode_ = 0. + // This part keeps original unit tests done before row-mt code is checked in. + row_mt_mode_ = 0; + // Encode using single thread. cfg_.g_threads = 1; init_flags_ = VPX_CODEC_USE_PSNR; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - single_thr_md5 = md5_; + const std::vector single_thr_md5 = md5_; md5_.clear(); // Encode using multiple threads. cfg_.g_threads = threads_; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - multi_thr_md5 = md5_; + const std::vector multi_thr_md5 = md5_; md5_.clear(); // Compare to check if two vectors are equal. ASSERT_EQ(single_thr_md5, multi_thr_md5); + + // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test. + row_mt_mode_ = 1; + + // Encode using single thread + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + std::vector row_mt_single_thr_md5 = md5_; + md5_.clear(); + + ASSERT_EQ(single_thr_md5, row_mt_single_thr_md5); + + // Part 3: Bit exact test with row-mt on + // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact + // result. + row_mt_mode_ = 1; + row_mt_single_thr_md5.clear(); + + // Encode using 2 threads. + cfg_.g_threads = 2; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + row_mt_single_thr_md5 = md5_; + md5_.clear(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const std::vector row_mt_multi_thr_md5 = md5_; + md5_.clear(); + + // Compare to check if two vectors are equal. + ASSERT_EQ(row_mt_single_thr_md5, row_mt_multi_thr_md5); + + // Part 4: PSNR test with bit_match_mode_ = 0 + row_mt_mode_ = 1; + + // Encode using single thread. + cfg_.g_threads = 1; + init_flags_ = VPX_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double single_thr_psnr = GetAveragePsnr(); + + // Encode using multiple threads. + cfg_.g_threads = threads_; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double multi_thr_psnr = GetAveragePsnr(); + + EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2); } -// Split this into two instantiations so that we can distinguish -// between very slow runs ( ie cpu_speed 0 ) vs ones that can be +INSTANTIATE_TEST_SUITE_P( + VP9, VPxFirstPassEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 4))); // cpu_used + +constexpr libvpx_test::TestMode kOnePassTestModes[] = { + libvpx_test::kRealTime, +#if !CONFIG_REALTIME_ONLY + libvpx_test::kOnePassGood, +#endif +}; + +// Split this into multiple instantiations so that we can distinguish +// between very slow runs ( i.e., cpu_speed 0 ) vs ones that can be // run nightly by adding Large to the title. -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9, VPxEncoderThreadTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP9)), - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(2, 9), // cpu_used + ::testing::ValuesIn(kOnePassTestModes), + ::testing::Range(3, 10), // cpu_used ::testing::Range(0, 3), // tile_columns ::testing::Range(2, 5))); // threads -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( VP9Large, VPxEncoderThreadTest, ::testing::Combine( ::testing::Values( static_cast(&libvpx_test::kVP9)), - ::testing::Values(::libvpx_test::kTwoPassGood, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(0, 2), // cpu_used + ::testing::ValuesIn(kOnePassTestModes), + ::testing::Range(0, 3), // cpu_used ::testing::Range(0, 3), // tile_columns ::testing::Range(2, 5))); // threads +#if !CONFIG_REALTIME_ONLY +INSTANTIATE_TEST_SUITE_P( + VP9LargeBest, VPxEncoderThreadTest, + ::testing::Combine( + ::testing::Values( + static_cast(&libvpx_test::kVP9)), + ::testing::Values(libvpx_test::kOnePassBest), + ::testing::Range(0, 10), // cpu_used + ::testing::Range(0, 3), // tile_columns + ::testing::Range(2, 5))); // threads +#endif + } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc new file mode 100644 index 0000000000..b85dee1d08 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" + +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" +#if CONFIG_VP9_DECODER +#include "vpx/vp8dx.h" +#endif +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_image.h" +#include "vpx/vpx_tpl.h" +#include "vpx_dsp/vpx_dsp_common.h" + +namespace { + +constexpr int kShowFrameCount = 10; +constexpr int kKeyframeQp = 10; +constexpr int kLeafQp = 40; +constexpr int kArfQp = 15; + +// Simple external rate controller for testing. +class RateControllerForTest { + public: + RateControllerForTest() : current_gop_(-1) {} + ~RateControllerForTest() {} + + void StartNextGop() { ++current_gop_; } + + vpx_rc_gop_decision_t GetCurrentGop() const { + vpx_rc_gop_decision_t gop_decision; + if (current_gop_ == 0) { + gop_decision.use_key_frame = 1; + gop_decision.use_alt_ref = 1; + gop_decision.gop_coding_frames = + kShowFrameCount - 1 + gop_decision.use_alt_ref; + // key frame + gop_decision.update_type[0] = VPX_RC_KF_UPDATE; + gop_decision.update_ref_index[0] = 0; + gop_decision.ref_frame_list[0] = get_kf_ref_frame(); + // arf + gop_decision.update_type[1] = VPX_RC_ARF_UPDATE; + gop_decision.update_ref_index[1] = 1; + gop_decision.ref_frame_list[1] = get_arf_ref_frame(); + // leafs + for (int i = 2; i < gop_decision.gop_coding_frames; ++i) { + gop_decision.update_type[i] = VPX_RC_LF_UPDATE; + gop_decision.update_ref_index[i] = 2; + gop_decision.ref_frame_list[i] = get_leaf_ref_frame(i); + } + } else { + // Pad a overlay-only GOP as the last GOP. + EXPECT_EQ(current_gop_, 1); + gop_decision.use_key_frame = 0; + gop_decision.use_alt_ref = 0; + gop_decision.gop_coding_frames = 1; + + gop_decision.update_type[0] = VPX_RC_OVERLAY_UPDATE; + gop_decision.update_ref_index[0] = 1; + gop_decision.ref_frame_list[0] = get_ovl_ref_frame(); + } + return gop_decision; + } + + int CalculateFrameDecision(int frame_index) { + if (current_gop_ == 0 && frame_index == 0) { + // Key frame, first frame in the first GOP. + return kKeyframeQp; + } else if (frame_index == 1) { + // ARF, we always use ARF for this test. + return kArfQp; + } else { + return kLeafQp; + } + } + + private: + vpx_rc_ref_frame_t get_kf_ref_frame() const { + vpx_rc_ref_frame_t ref_frame; + ref_frame.index[0] = -1; + ref_frame.index[1] = -1; + ref_frame.index[2] = -1; + ref_frame.name[0] = VPX_RC_INVALID_REF_FRAME; + ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME; + ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME; + return ref_frame; + } + vpx_rc_ref_frame_t get_arf_ref_frame() const { + vpx_rc_ref_frame_t ref_frame; + ref_frame.index[0] = 0; + ref_frame.index[1] = -1; + ref_frame.index[2] = -1; + ref_frame.name[0] = VPX_RC_GOLDEN_FRAME; + ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME; + ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME; + return ref_frame; + } + vpx_rc_ref_frame_t get_leaf_ref_frame(int count) const { + vpx_rc_ref_frame_t ref_frame; + ref_frame.index[0] = 0; + ref_frame.index[1] = 1; + ref_frame.index[2] = count > 2 ? 2 : -1; + ref_frame.name[0] = VPX_RC_GOLDEN_FRAME; + ref_frame.name[1] = VPX_RC_ALTREF_FRAME; + ref_frame.name[2] = + count > 2 ? VPX_RC_LAST_FRAME : VPX_RC_INVALID_REF_FRAME; + return ref_frame; + } + vpx_rc_ref_frame_t get_ovl_ref_frame() const { + vpx_rc_ref_frame_t ref_frame; + ref_frame.index[0] = 1; + ref_frame.index[1] = -1; + ref_frame.index[2] = -1; + ref_frame.name[0] = VPX_RC_ALTREF_FRAME; + ref_frame.name[1] = VPX_RC_INVALID_REF_FRAME; + ref_frame.name[2] = VPX_RC_INVALID_REF_FRAME; + return ref_frame; + } + + int current_gop_; +}; + +// Callbacks used in this test. +vpx_rc_status_t rc_test_create_model( + void * /*priv*/, const vpx_rc_config_t * /*ratectrl_config*/, + vpx_rc_model_t *rate_ctrl_model_ptr) { + std::unique_ptr test_controller( + new RateControllerForTest()); + *rate_ctrl_model_ptr = test_controller.release(); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_test_send_firstpass_stats( + vpx_rc_model_t /*rate_ctrl_model*/, + const vpx_rc_firstpass_stats_t *first_pass_stats) { + EXPECT_EQ(first_pass_stats->num_frames, kShowFrameCount); + for (int i = 0; i < first_pass_stats->num_frames; ++i) { + EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_test_send_tpl_gop_stats( + vpx_rc_model_t /*rate_ctrl_model*/, const VpxTplGopStats *tpl_gop_stats) { + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + +vpx_rc_status_t rc_test_get_encodeframe_decision( + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, + vpx_rc_encodeframe_decision_t *frame_decision) { + RateControllerForTest *test_controller = + static_cast(rate_ctrl_model); + frame_decision->q_index = + test_controller->CalculateFrameDecision(frame_gop_index); + frame_decision->rdmult = + frame_decision->q_index * frame_decision->q_index / 2; + frame_decision->delta_q_uv = 0; + return VPX_RC_OK; +} + +vpx_rc_status_t rc_test_get_gop_decision(vpx_rc_model_t rate_ctrl_model, + vpx_rc_gop_decision_t *gop_decision) { + RateControllerForTest *test_controller = + static_cast(rate_ctrl_model); + test_controller->StartNextGop(); + *gop_decision = test_controller->GetCurrentGop(); + return VPX_RC_OK; +} + +vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) { + RateControllerForTest *test_controller = + static_cast(rate_ctrl_model); + delete test_controller; + return VPX_RC_OK; +} + +class ExtRateCtrlTest : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + ExtRateCtrlTest() + : EncoderTest(&::libvpx_test::kVP9), received_show_frame_count_(0), + current_frame_qp_(0) {} + + ~ExtRateCtrlTest() override = default; + + void SetUp() override { + InitializeConfig(); +#if CONFIG_REALTIME_ONLY + SetMode(::libvpx_test::kRealTime); +#else + SetMode(::libvpx_test::kTwoPassGood); +#endif + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + vpx_rc_funcs_t rc_funcs = {}; + rc_funcs.rc_type = VPX_RC_GOP_QP; + rc_funcs.create_model = rc_test_create_model; + rc_funcs.send_firstpass_stats = rc_test_send_firstpass_stats; + rc_funcs.send_tpl_gop_stats = rc_test_send_tpl_gop_stats; + rc_funcs.get_gop_decision = rc_test_get_gop_decision; + rc_funcs.get_encodeframe_decision = rc_test_get_encodeframe_decision; + rc_funcs.delete_model = rc_delete_model; + encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs); + } + } + +#if CONFIG_VP9_DECODER + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const ::libvpx_test::VideoSource & /*video*/, + ::libvpx_test::Decoder *decoder) override { + EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + decoder->Control(VPXD_GET_LAST_QUANTIZER, ¤t_frame_qp_); + return VPX_CODEC_OK == res_dec; + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + // We are not comparing current_frame_qp_ here because the encoder will + // pack ARF and the next show frame into one pkt. Therefore, we might + // receive two frames in one pkt. However, one thing we are sure is that + // each pkt will have just one show frame. Therefore, we can check if the + // received show frame count match the actual show frame count. + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + ++received_show_frame_count_; + } + } +#endif // CONFIG_VP9_DECODER + + int received_show_frame_count_; + int current_frame_qp_; +}; + +TEST_F(ExtRateCtrlTest, EncodeTest) { + cfg_.rc_target_bitrate = 4000; + cfg_.g_lag_in_frames = 25; + + std::unique_ptr video; + video.reset(new (std::nothrow) libvpx_test::YUVVideoSource( + "bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, + kShowFrameCount)); + + ASSERT_NE(video, nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + EXPECT_EQ(received_show_frame_count_, kShowFrameCount); +} + +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc b/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc deleted file mode 100644 index 670cd4d721..0000000000 --- a/media/libvpx/libvpx/test/vp9_frame_parallel_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include -#include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#include "test/codec_factory.h" -#include "test/decode_test_driver.h" -#include "test/ivf_video_source.h" -#include "test/md5_helper.h" -#include "test/util.h" -#if CONFIG_WEBM_IO -#include "test/webm_video_source.h" -#endif -#include "vpx_mem/vpx_mem.h" - -namespace { - -using std::string; - -#if CONFIG_WEBM_IO - -struct PauseFileList { - const char *name; - // md5 sum for decoded frames which does not include skipped frames. - const char *expected_md5; - const int pause_frame_num; -}; - -// Decodes |filename| with |num_threads|. Pause at the specified frame_num, -// seek to next key frame and then continue decoding until the end. Return -// the md5 of the decoded frames which does not include skipped frames. -string DecodeFileWithPause(const string &filename, int num_threads, - int pause_num) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - int in_frames = 0; - int out_frames = 0; - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - vpx_codec_flags_t flags = 0; - flags |= VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - do { - ++in_frames; - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - // Pause at specified frame number. - if (in_frames == pause_num) { - // Flush the decoder and then seek to next key frame. - decoder.DecodeFrame(NULL, 0); - video.SeekToNextKeyFrame(); - } else { - video.Next(); - } - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(in_frames, out_frames) - << "Input frame count does not match output frame count"; - - return string(md5.Get()); -} - -void DecodeFilesWithPause(const PauseFileList files[]) { - for (const PauseFileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFileWithPause(iter->name, t, iter->pause_frame_num)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) { - // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. - static const PauseFileList files[] = { - { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1", - 6 }, - { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45", - 12 }, - { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8", - 26 }, - { NULL, NULL, 0 }, - }; - DecodeFilesWithPause(files); -} - -struct FileList { - const char *name; - // md5 sum for decoded frames which does not include corrupted frames. - const char *expected_md5; - // Expected number of decoded frames which does not include corrupted frames. - const int expected_frame_count; -}; - -// Decodes |filename| with |num_threads|. Return the md5 of the decoded -// frames which does not include corrupted frames. -string DecodeFile(const string &filename, int num_threads, - int expected_frame_count) { - libvpx_test::WebMVideoSource video(filename); - video.Init(); - - vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); - cfg.threads = num_threads; - const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING; - libvpx_test::VP9Decoder decoder(cfg, flags); - - libvpx_test::MD5 md5; - video.Begin(); - - int out_frames = 0; - do { - const vpx_codec_err_t res = - decoder.DecodeFrame(video.cxdata(), video.frame_size()); - // TODO(hkuang): frame parallel mode should return an error on corruption. - if (res != VPX_CODEC_OK) { - EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); - break; - } - - video.Next(); - - // Flush the decoder at the end of the video. - if (!video.cxdata()) decoder.DecodeFrame(NULL, 0); - - libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img; - - // Get decompressed data - while ((img = dec_iter.Next())) { - ++out_frames; - md5.Add(img); - } - } while (video.cxdata() != NULL); - - EXPECT_EQ(expected_frame_count, out_frames) - << "Input frame count does not match expected output frame count"; - - return string(md5.Get()); -} - -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 2; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, - DecodeFile(iter->name, t, iter->expected_frame_count)) - << "threads = " << t; - } - } -} - -TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) { - static const FileList files[] = { - // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with - // one key frame for every ten frames. The 11th frame has corrupted data. - { "invalid-vp90-2-07-frame_parallel-1.webm", - "0549d0f45f60deaef8eb708e6c0eb6cb", 30 }, - // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with - // one key frame for every ten frames. The 1st and 31st frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-2.webm", - "6a1f3cf6f9e7a364212fadb9580d525e", 20 }, - // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with - // one key frame for every ten frames. The 5th and 13th frames have - // corrupted data. - { "invalid-vp90-2-07-frame_parallel-3.webm", - "8256544308de926b0681e04685b98677", 27 }, - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} - -TEST(VP9MultiThreadedFrameParallel, ValidFileTest) { - static const FileList files[] = { -#if CONFIG_VP9_HIGHBITDEPTH - { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 }, -#endif - { NULL, NULL, 0 }, - }; - DecodeFiles(files); -} -#endif // CONFIG_WEBM_IO -} // namespace diff --git a/media/libvpx/libvpx/test/vp9_intrapred_test.cc b/media/libvpx/libvpx/test/vp9_intrapred_test.cc index 8c5fb20191..6733ef73d1 100644 --- a/media/libvpx/libvpx/test/vp9_intrapred_test.cc +++ b/media/libvpx/libvpx/test/vp9_intrapred_test.cc @@ -10,7 +10,7 @@ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -28,11 +28,11 @@ using libvpx_test::ACMRandom; const int count_test_block = 100000; -typedef void (*IntraPredFunc)(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left); +using IntraPredFunc = void (*)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); struct IntraPredParam { - IntraPredParam(IntraPredFunc pred = NULL, IntraPredFunc ref = NULL, + IntraPredParam(IntraPredFunc pred = nullptr, IntraPredFunc ref = nullptr, int block_size_value = 0, int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} @@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam { ref_dst_ = ref_dst; int error_count = 0; for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element, however relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values. + for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else // Fill edges with random data, try first with saturated values. for (int x = -1; x < block_size; x++) { if (i == 0) { @@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam { for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } +#endif for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -80,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam { } protected: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; @@ -119,7 +135,7 @@ void IntraPredTest::Predict() { params_.pred_fn(dst_, stride_, above_row_, left_col_)); } -typedef IntraPredTest VP9IntraPredTest; +using VP9IntraPredTest = IntraPredTest; TEST_P(VP9IntraPredTest, IntraPredTests) { // max block size is 32 @@ -130,8 +146,14 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { RunTest(left_col, above_data, dst, ref_dst); } +// Instantiate a token test to avoid -Wuninitialized warnings when none of the +// other tests are enabled. +INSTANTIATE_TEST_SUITE_P( + C, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c, + &vpx_d45_predictor_4x4_c, 4, 8))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_d45_predictor_4x4_sse2, &vpx_d45_predictor_4x4_c, 4, @@ -195,7 +217,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSSE3, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_ssse3, &vpx_d45_predictor_16x16_c, 16, 8), @@ -226,7 +248,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSSE3 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_d45_predictor_4x4_neon, &vpx_d45_predictor_4x4_c, 4, @@ -237,6 +259,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_d45_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d45_predictor_32x32_neon, &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, @@ -245,6 +283,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_d135_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d135_predictor_32x32_neon, &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, @@ -300,7 +354,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_NEON #if HAVE_DSPR2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( DSPR2, VP9IntraPredTest, ::testing::Values(IntraPredParam(&vpx_dc_predictor_4x4_dspr2, &vpx_dc_predictor_4x4_c, 4, 8), @@ -321,7 +375,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_DSPR2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( MSA, VP9IntraPredTest, ::testing::Values( IntraPredParam(&vpx_dc_128_predictor_4x4_msa, @@ -378,13 +432,81 @@ INSTANTIATE_TEST_CASE_P( 8))); #endif // HAVE_MSA +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 + IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8), + IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8, + 8), +#endif + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P( + VSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_vsx, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_16x16_vsx, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_vsx, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, + &vpx_dc_left_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_16x16_vsx, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_vsx, + &vpx_dc_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_16x16_vsx, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_vsx, + &vpx_h_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_tm_predictor_16x16_vsx, + &vpx_tm_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_vsx, + &vpx_tm_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_v_predictor_16x16_vsx, + &vpx_v_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_vsx, + &vpx_v_predictor_32x32_c, 32, 8))); +#endif // HAVE_VSX + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bps); +using HighbdIntraPred = void (*)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bps); + struct HighbdIntraPredParam { - HighbdIntraPredParam(HighbdIntraPred pred = NULL, HighbdIntraPred ref = NULL, - int block_size_value = 0, int bit_depth_value = 0) + HighbdIntraPredParam(HighbdIntraPred pred = nullptr, + HighbdIntraPred ref = nullptr, int block_size_value = 0, + int bit_depth_value = 0) : pred_fn(pred), ref_fn(ref), block_size(block_size_value), bit_depth(bit_depth_value) {} @@ -394,6 +516,7 @@ struct HighbdIntraPredParam { int bit_depth; }; +#if HAVE_SSSE3 || HAVE_NEON || HAVE_SSE2 template <> void IntraPredTest::Predict() { const int bit_depth = params_.bit_depth; @@ -402,7 +525,8 @@ void IntraPredTest::Predict() { params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth)); } -typedef IntraPredTest VP9HighbdIntraPredTest; +using VP9HighbdIntraPredTest = IntraPredTest; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VP9HighbdIntraPredTest); TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { // max block size is 32 @@ -412,11 +536,166 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 32 * 32]); RunTest(left_col, above_data, dst, ref_dst); } +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -425,6 +704,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -433,6 +720,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -442,9 +737,35 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -453,6 +774,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -461,6 +790,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -470,9 +807,35 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 10))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( SSE2_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -481,6 +844,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -489,6 +860,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -500,7 +879,7 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, @@ -511,6 +890,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -519,6 +914,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -576,7 +987,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, &vpx_highbd_v_predictor_32x32_c, 32, 8))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, @@ -587,6 +998,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -595,6 +1022,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -652,7 +1095,7 @@ INSTANTIATE_TEST_CASE_P( HighbdIntraPredParam(&vpx_highbd_v_predictor_32x32_neon, &vpx_highbd_v_predictor_32x32_c, 32, 10))); -INSTANTIATE_TEST_CASE_P( +INSTANTIATE_TEST_SUITE_P( NEON_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, @@ -663,6 +1106,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -671,6 +1130,22 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/media/libvpx/libvpx/test/vp9_lossless_test.cc b/media/libvpx/libvpx/test/vp9_lossless_test.cc index 703b55e9bd..48839a7a23 100644 --- a/media/libvpx/libvpx/test/vp9_lossless_test.cc +++ b/media/libvpx/libvpx/test/vp9_lossless_test.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/codec_factory.h" @@ -29,16 +29,16 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - virtual ~LosslessTest() {} + ~LosslessTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) { @@ -47,12 +47,12 @@ class LosslessTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; } @@ -118,8 +118,13 @@ TEST_P(LosslessTest, TestLossLessEncodingCtrl) { EXPECT_GE(psnr_lossless, kMaxPsnr); } -VP9_INSTANTIATE_TEST_CASE(LosslessTest, - ::testing::Values(::libvpx_test::kRealTime, - ::libvpx_test::kOnePassGood, - ::libvpx_test::kTwoPassGood)); +#if CONFIG_REALTIME_ONLY +VP9_INSTANTIATE_TEST_SUITE(LosslessTest, + ::testing::Values(::libvpx_test::kRealTime)); +#else +VP9_INSTANTIATE_TEST_SUITE(LosslessTest, + ::testing::Values(::libvpx_test::kRealTime, + ::libvpx_test::kOnePassGood, + ::libvpx_test::kTwoPassGood)); +#endif } // namespace diff --git a/media/libvpx/libvpx/test/vp9_motion_vector_test.cc b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc new file mode 100644 index 0000000000..b47f530909 --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_motion_vector_test.cc @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/yuv_video_source.h" +#include "vpx_config.h" + +namespace { +#define MAX_EXTREME_MV 1 +#define MIN_EXTREME_MV 2 + +// Encoding modes +const libvpx_test::TestMode kEncodingModeVectors[] = { +#if !CONFIG_REALTIME_ONLY + ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, +#endif + ::libvpx_test::kRealTime +}; + +// Encoding speeds +const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6 }; + +// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV. +const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV }; + +class MotionVectorTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith3Params { + protected: + MotionVectorTestLarge() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} + + ~MotionVectorTestLarge() override = default; + + void SetUp() override { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + cfg_.g_lag_in_frames = 3; + cfg_.rc_end_usage = VPX_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, cpu_used_); + encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); + if (encoding_mode_ != ::libvpx_test::kRealTime) { + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + } + + libvpx_test::TestMode encoding_mode_; + int cpu_used_; + int mv_test_mode_; +}; + +TEST_P(MotionVectorTestLarge, OverallTest) { + cfg_.rc_target_bitrate = 24000; + cfg_.g_profile = 0; + init_flags_ = VPX_CODEC_USE_PSNR; + + std::unique_ptr video; + video.reset(new libvpx_test::YUVVideoSource( + "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160, // 2048, 1080, + 30, 1, 0, 5)); + + ASSERT_NE(video.get(), nullptr); + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +VP9_INSTANTIATE_TEST_SUITE(MotionVectorTestLarge, + ::testing::ValuesIn(kEncodingModeVectors), + ::testing::ValuesIn(kCpuUsedVectors), + ::testing::ValuesIn(kMVTestModes)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_quantize_test.cc b/media/libvpx/libvpx/test/vp9_quantize_test.cc index 4643895021..0c49598532 100644 --- a/media/libvpx/libvpx/test/vp9_quantize_test.cc +++ b/media/libvpx/libvpx/test/vp9_quantize_test.cc @@ -9,336 +9,716 @@ */ #include +#include #include #include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using libvpx_test::Buffer; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int number_of_iterations = 100; -typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - int skip_block, const int16_t *zbin, - const int16_t *round, const int16_t *quant, - const int16_t *quant_shift, tran_low_t *qcoeff, - tran_low_t *dqcoeff, const int16_t *dequant, - uint16_t *eob, const int16_t *scan, - const int16_t *iscan); -typedef std::tr1::tuple - QuantizeParam; +using QuantizeFunc = void (*)(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); +using QuantizeParam = std::tuple; -class VP9QuantizeTest : public ::testing::TestWithParam { +// Wrapper for 32x32 version which does not use count +using Quantize32x32Func = void (*)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + +// Wrapper for FP version which does not use zbin or quant_shift. +using QuantizeFPFunc = void (*)(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. + dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + +class VP9QuantizeBase : public AbstractBench { public: - virtual ~VP9QuantizeTest() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; + VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp), + coeff_(Buffer(max_size_, max_size_, 0, 16)), + qcoeff_(Buffer(max_size_, max_size_, 0, 32)), + dqcoeff_(Buffer(max_size_, max_size_, 0, 32)) { + // TODO(jianj): SSSE3 and AVX2 tests fail on extreme values. +#if HAVE_NEON + max_value_ = (1 << (7 + bit_depth_)) - 1; +#else + max_value_ = (1 << bit_depth_) - 1; +#endif + + mb_plane_ = reinterpret_cast( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); + round_ptr_ = mb_plane_->round = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*round_ptr_))); + quant_ptr_ = mb_plane_->quant = + reinterpret_cast(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); + dequant_ptr_ = reinterpret_cast( + vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); + + r_ptr_ = (is_fp_) ? round_fp_ptr_ : round_ptr_; + q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; -}; - -class VP9Quantize32Test : public ::testing::TestWithParam { - public: - virtual ~VP9Quantize32Test() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; + ~VP9QuantizeBase() override { + vpx_free(mb_plane_); + vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); + vpx_free(quant_fp_ptr_); + vpx_free(round_ptr_); + vpx_free(quant_ptr_); + vpx_free(quant_shift_ptr_); + vpx_free(dequant_ptr_); + mb_plane_ = nullptr; + zbin_ptr_ = nullptr; + round_fp_ptr_ = nullptr; + quant_fp_ptr_ = nullptr; + round_ptr_ = nullptr; + quant_ptr_ = nullptr; + quant_shift_ptr_ = nullptr; + dequant_ptr_ = nullptr; + libvpx_test::ClearSystemState(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + protected: + macroblock_plane *mb_plane_; + int16_t *zbin_ptr_; + int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; + int16_t *round_ptr_; + int16_t *quant_ptr_; + int16_t *quant_shift_ptr_; + int16_t *dequant_ptr_; + const vpx_bit_depth_t bit_depth_; + int max_value_; + const int max_size_; + const bool is_fp_; + Buffer coeff_; + Buffer qcoeff_; + Buffer dqcoeff_; + int16_t *r_ptr_; + int16_t *q_ptr_; + int count_; + const ScanOrder *scan_; + uint16_t eob_; +}; + +class VP9QuantizeTest : public VP9QuantizeBase, + public ::testing::TestWithParam { + public: + VP9QuantizeTest() + : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)), + quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; + void Run() override; + void Speed(bool is_median); + const QuantizeFunc quantize_op_; + const QuantizeFunc ref_quantize_op_; }; +void VP9QuantizeTest::Run() { + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); +} + +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + + if (is_median) { + RunNTimes(10000000 / count_); + PrintMedian(title); + } else { + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast(vpx_usec_timer_elapsed(&simd_timer)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + +// This quantizer compares the AC coefficients to the quantization step size to +// determine if further multiplication operations are needed. +// Based on vp9_quantize_fp_sse2(). +inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order, + int is_32x32) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> (1 + is_32x32); + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. + if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } + } + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp; + int _round; + + if (is_32x32) { + _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + } else { + _round = round_ptr[rc != 0]; + } + tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32); + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = + static_cast(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + + if (is_32x32) { + dqcoeff_ptr[rc] = static_cast(qcoeff_ptr[rc] * + dequant_ptr[rc != 0] / 2); + } else { + dqcoeff_ptr[rc] = + static_cast(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); + } + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + } + } + *eob_ptr = eob + 1; +} + +void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); +} + +void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); +} + TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; - } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; -} + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + eob_ = 0; -TEST_P(VP9Quantize32Test, OperationCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = TX_32X32; - const TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); + const TX_TYPE tx_type = static_cast((i >> 2) % 3); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); + coeff_.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); + + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob_, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + Buffer ref_qcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer ref_dqcoeff = + Buffer(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + eob_ = 0; + const uint32_t max_index = max_size_ * max_size_ - 1; + for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast((i >> 2) % 3); + scan_ = &vp9_scan_orders[sz][tx_type]; + count_ = (4 << sz) * (4 << sz); // Two random entries - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } + coeff_.Set(0); + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] = + static_cast(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob_, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff_.PrintDifference(ref_qcoeff); + dqcoeff_.PrintDifference(ref_dqcoeff); + return; } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } -TEST_P(VP9Quantize32Test, EOBCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = TX_32X32; - TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - // Two random entries - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; - } - err_count_total += err_count; - } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; -} -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P( - SSE2, VP9Quantize32Test, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12))); -#endif // HAVE_SSE2 + ::testing::Values( + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); + +#else +INSTANTIATE_TEST_SUITE_P( + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_SSSE3 + +#if HAVE_AVX +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // HAVE_AVX + +#if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, 16, + true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_12, + 32, true), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false))); +#else +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true), + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_AVX2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); +#else +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_vsx, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, + VPX_BITS_8, 32, true))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, + VPX_BITS_8, 32, false))); +#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_SUITE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper, + &Quant32x32Wrapper, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true), + make_tuple(&QuantFPWrapper, + &QuantFPWrapper, VPX_BITS_8, 32, + true))); } // namespace diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc new file mode 100644 index 0000000000..e58f0a0d9d --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -0,0 +1,675 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/ratectrl_rtc.h" + +#include +#include // NOLINT +#include + +#include "./vpx_config.h" +#include "gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +const size_t kNumFrames = 300; + +const int kTemporalId3Layer[4] = { 0, 2, 1, 2 }; +const int kTemporalId2Layer[2] = { 0, 1 }; +const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; +const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; +const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; + +class RcInterfaceTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + RcInterfaceTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} + + ~RcInterfaceTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { + // Disable golden frame update. + frame_flags_ |= VP8_EFLAG_NO_UPD_GF; + frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; + } + encoder_exit_ = video->frame() == kNumFrames; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int loopfilter_level, qp; + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } + } + + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); + } + + void RunOneLayer() { + SetConfig(GET_PARAM(2)); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunOneLayerVBRPeriodicKey() { + if (GET_PARAM(2) != VPX_VBR) return; + key_interval_ = 100; + SetConfig(VPX_VBR); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + void SetConfig(vpx_rc_mode rc_mode) { + rc_cfg_.width = 1280; + rc_cfg_.height = 720; + rc_cfg_.max_quantizer = 52; + rc_cfg_.min_quantizer = 2; + rc_cfg_.target_bandwidth = 1000; + rc_cfg_.buf_initial_sz = 600; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = 1000; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 1000; + rc_cfg_.framerate = 30.0; + rc_cfg_.ss_number_layers = 1; + rc_cfg_.ts_number_layers = 1; + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + rc_cfg_.layer_target_bitrate[0] = 1000; + rc_cfg_.max_quantizers[0] = 52; + rc_cfg_.min_quantizers[0] = 2; + rc_cfg_.rc_mode = rc_mode; + rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + + // Encoder settings for ground truth. + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_initial_sz = 600; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = rc_mode; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + cfg_.rc_target_bitrate = 1000; + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + } + + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + int aq_mode_; + int key_interval_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; +}; + +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} + ~RcInterfaceSvcTest() override = default; + + protected: + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + } + + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + current_superframe_ = 0; + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + } + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + encoder_exit_ = video->frame() == kNumFrames; + if (dynamic_spatial_layers_ == 1) { + if (video->frame() == 100) { + // Go down to 2 spatial layers: set top SL to 0 bitrate. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8]; + rc_cfg_.layer_target_bitrate[6] = 0; + rc_cfg_.layer_target_bitrate[7] = 0; + rc_cfg_.layer_target_bitrate[8] = 0; + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (video->frame() == 200) { + // Go down to 1 spatial layer. + // Update the encoder config. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5]; + cfg_.layer_target_bitrate[3] = 0; + cfg_.layer_target_bitrate[4] = 0; + cfg_.layer_target_bitrate[5] = 0; + encoder->Config(&cfg_); + // Update the RC config. + rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5]; + rc_cfg_.layer_target_bitrate[3] = 0; + rc_cfg_.layer_target_bitrate[4] = 0; + rc_cfg_.layer_target_bitrate[5] = 0; + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { + // TODO(marpan): Re-enable this going back up when issue is fixed. + // Go back up to 3 spatial layers. + // Update the encoder config: use the original bitrates. + SetEncoderConfigSvc(3, 3); + encoder->Config(&cfg_); + // Update the RC config. + SetRCConfigSvc(3, 3); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } + } + } + + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; + ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } + while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); + ParseSuperframeSizes(static_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + // For sl=0 ComputeQP() is already called above (line 310). + if (sizes_[sl] > 0 && sl > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } + } + if (!superframe_is_dropped) { + int loopfilter_level; + std::vector encoder_qp(VPX_SS_MAX_LAYERS, 0); + encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; + } + } + // This method needs to be overridden because non-reference frames are + // expected to be mismatched frames as the encoder will avoid loopfilter on + // these frames. + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} + + void RunSvc() { + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + + void RunSvcPeriodicKey() { + SetRCConfigSvc(3, 3); + key_interval_ = 100; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcDynamicSpatial() { + dynamic_spatial_layers_ = 1; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + private: + vpx_codec_err_t ParseSuperframeSizes(const uint8_t *data, size_t data_sz) { + uint8_t marker = *(data + data_sz - 1); + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (uint32_t i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (uint32_t j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes_[i] = this_sz; + } + } + return VPX_CODEC_OK; + } + + void SetEncoderConfigSvc(int number_spatial_layers, + int number_temporal_layers) { + cfg_.g_w = 1280; + cfg_.g_h = 720; + cfg_.ss_number_layers = number_spatial_layers; + cfg_.ts_number_layers = number_temporal_layers; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 30; + if (number_spatial_layers == 3) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 4; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 4; + svc_params_.scaling_factor_num[2] = 4; + svc_params_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 2; + svc_params_.scaling_factor_num[1] = 2; + svc_params_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + } + + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 56; + svc_params_.min_quantizers[i] = 2; + svc_params_.speed_per_layer[i] = 7; + svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL; + } + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + if (number_temporal_layers == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (number_temporal_layers == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (number_temporal_layers == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; + + cfg_.rc_target_bitrate = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + cfg_.rc_target_bitrate += spatial_bitrate; + } + + cfg_.kf_min_dist = key_interval_; + cfg_.kf_max_dist = key_interval_; + } + + void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) { + rc_cfg_.width = 1280; + rc_cfg_.height = 720; + rc_cfg_.ss_number_layers = number_spatial_layers; + rc_cfg_.ts_number_layers = number_temporal_layers; + rc_cfg_.max_quantizer = 56; + rc_cfg_.min_quantizer = 2; + rc_cfg_.buf_initial_sz = 500; + rc_cfg_.buf_optimal_sz = 600; + rc_cfg_.buf_sz = 1000; + rc_cfg_.undershoot_pct = 50; + rc_cfg_.overshoot_pct = 50; + rc_cfg_.max_intra_bitrate_pct = 900; + rc_cfg_.framerate = 30.0; + rc_cfg_.rc_mode = VPX_CBR; + rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; + + if (number_spatial_layers == 3) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 4; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 4; + rc_cfg_.scaling_factor_num[2] = 4; + rc_cfg_.scaling_factor_den[2] = 4; + } else if (number_spatial_layers == 2) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 2; + rc_cfg_.scaling_factor_num[1] = 2; + rc_cfg_.scaling_factor_den[1] = 2; + } else if (number_spatial_layers == 1) { + rc_cfg_.scaling_factor_num[0] = 1; + rc_cfg_.scaling_factor_den[0] = 1; + } + + if (number_temporal_layers == 3) { + rc_cfg_.ts_rate_decimator[0] = 4; + rc_cfg_.ts_rate_decimator[1] = 2; + rc_cfg_.ts_rate_decimator[2] = 1; + } else if (number_temporal_layers == 2) { + rc_cfg_.ts_rate_decimator[0] = 2; + rc_cfg_.ts_rate_decimator[1] = 1; + } else if (number_temporal_layers == 1) { + rc_cfg_.ts_rate_decimator[0] = 1; + } + + rc_cfg_.target_bandwidth = 0; + for (int sl = 0; sl < number_spatial_layers; sl++) { + int spatial_bitrate = 0; + if (number_spatial_layers <= 3) + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; + for (int tl = 0; tl < number_temporal_layers; tl++) { + int layer = sl * number_temporal_layers + tl; + if (number_temporal_layers == 3) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 2) + rc_cfg_.layer_target_bitrate[layer] = + kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100; + else if (number_temporal_layers == 1) + rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate; + } + rc_cfg_.target_bandwidth += spatial_bitrate; + } + + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) { + for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) { + const int i = sl * rc_cfg_.ts_number_layers + tl; + rc_cfg_.max_quantizers[i] = 56; + rc_cfg_.min_quantizers[i] = 2; + } + } + } + + int aq_mode_; + std::unique_ptr rc_api_; + libvpx::VP9RateControlRtcConfig rc_cfg_; + vpx_svc_extra_cfg_t svc_params_; + libvpx::VP9FrameParamsQpRTC frame_params_; + bool encoder_exit_; + int current_superframe_; + uint32_t sizes_[8]; + int key_interval_; + int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. + bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; +}; + +TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } + +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + +TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } + +TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } + +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + +TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } + +TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } + +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), + ::testing::Values(VPX_CBR, VPX_VBR)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_roi_test.cc b/media/libvpx/libvpx/test/vp9_roi_test.cc new file mode 100644 index 0000000000..9427796a2b --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_roi_test.cc @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/video_source.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" + +#define MASK_WIDTH 40 +#define MASK_HEIGHT 30 +#define MASK_SIZE MASK_WIDTH *MASK_HEIGHT + +namespace { + +const int mask[MASK_SIZE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 +}; + +class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, + public ::testing::Test { + protected: + RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } + + void SetUp() override { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + SetRoi(); + } + + void SetRoi() { + const int block_size = 8; + unsigned int i, j; + roi_.rows = (cfg_.g_h + block_size - 1) / block_size; + roi_.cols = (cfg_.g_w + block_size - 1) / block_size; + memset(&roi_.skip, 0, sizeof(roi_.skip)); + memset(&roi_.delta_q, 0, sizeof(roi_.delta_q)); + memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf)); + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + // Use segment 3 for skip. + roi_.skip[3] = 1; + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (i = 0; i < roi_.rows; ++i) { + for (j = 0; j < roi_.cols; ++j) { + const int idx = i * roi_.cols + j; + if (mask[idx] == 1) roi_.roi_map[idx] = 3; + } + } + } + + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, 7); + encoder->Control(VP9E_SET_AQ_MODE, 3); + } + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + + private: + vpx_roi_map_t roi_; +}; + +TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30, + 1, 0, 150); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +} // namespace diff --git a/media/libvpx/libvpx/test/vp9_scale_test.cc b/media/libvpx/libvpx/test/vp9_scale_test.cc new file mode 100644 index 0000000000..53b38d22aa --- /dev/null +++ b/media/libvpx/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +using ScaleFrameFunc = void (*)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam { + public: + ~ScaleTest() override = default; + + protected: + void SetUp() override { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest(INTERP_FILTER filter_type) { + static const int kNumSizesToTest = 22; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { 1, 2, 3, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 68, 128, 134 }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). + if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_SUITE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc index e847bbddf3..c080a2caae 100644 --- a/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc +++ b/media/libvpx/libvpx/test/vp9_skip_loopfilter_test.cc @@ -24,33 +24,42 @@ const char kVp9Md5File[] = "vp90-2-08-tile_1x8_frame_parallel.webm.md5"; // Class for testing shutting off the loop filter. class SkipLoopFilterTest { public: - SkipLoopFilterTest() : video_(NULL), decoder_(NULL), md5_file_(NULL) {} + SkipLoopFilterTest() + : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {} ~SkipLoopFilterTest() { - if (md5_file_ != NULL) fclose(md5_file_); + if (md5_file_ != nullptr) fclose(md5_file_); delete decoder_; delete video_; } // If |threads| > 0 then set the decoder with that number of threads. - void Init(int num_threads) { + bool Init(int num_threads) { expected_md5_[0] = '\0'; junk_[0] = '\0'; video_ = new libvpx_test::WebMVideoSource(kVp9TestFile); - ASSERT_TRUE(video_ != NULL); + if (video_ == nullptr) { + EXPECT_NE(video_, nullptr); + return false; + } video_->Init(); video_->Begin(); vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); if (num_threads > 0) cfg.threads = num_threads; decoder_ = new libvpx_test::VP9Decoder(cfg, 0); - ASSERT_TRUE(decoder_ != NULL); + if (decoder_ == nullptr) { + EXPECT_NE(decoder_, nullptr); + return false; + } OpenMd5File(kVp9Md5File); + return !::testing::Test::HasFailure(); } // Set the VP9 skipLoopFilter control value. void SetSkipLoopFilter(int value, vpx_codec_err_t expected_value) { + ASSERT_NE(decoder_, nullptr); decoder_->Control(VP9_SET_SKIP_LOOP_FILTER, value, expected_value); } @@ -65,7 +74,7 @@ class SkipLoopFilterTest { } vpx_codec_err_t DecodeRemainingFrames() { - for (; video_->cxdata() != NULL; video_->Next()) { + for (; video_->cxdata() != nullptr; video_->Next()) { const vpx_codec_err_t res = decoder_->DecodeFrame(video_->cxdata(), video_->frame_size()); if (res != VPX_CODEC_OK) return res; @@ -85,13 +94,13 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name; + ASSERT_NE(md5_file_, nullptr) + << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. void ReadMd5() { - ASSERT_TRUE(md5_file_ != NULL); + ASSERT_NE(md5_file_, nullptr); const int res = fscanf(md5_file_, "%s %s", expected_md5_, junk_); ASSERT_NE(EOF, res) << "Read md5 data failed"; expected_md5_[32] = '\0'; @@ -121,7 +130,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter) { const int non_zero_value = 1; const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -131,7 +140,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilterSingleThread) { const int non_zero_value = 1; const int num_threads = 1; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -141,7 +150,7 @@ TEST(SkipLoopFilterTest, ShutOffLoopFilter8Threads) { const int non_zero_value = 1; const int num_threads = 8; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); skip_loop_filter.CheckMd5(false); @@ -151,7 +160,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) { const int non_zero_value = 1; const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); skip_loop_filter.SetSkipLoopFilter(non_zero_value, VPX_CODEC_OK); skip_loop_filter.SetSkipLoopFilter(0, VPX_CODEC_OK); ASSERT_EQ(VPX_CODEC_OK, skip_loop_filter.DecodeRemainingFrames()); @@ -161,7 +170,7 @@ TEST(SkipLoopFilterTest, WithLoopFilter) { TEST(SkipLoopFilterTest, ToggleLoopFilter) { const int num_threads = 0; SkipLoopFilterTest skip_loop_filter; - skip_loop_filter.Init(num_threads); + ASSERT_TRUE(skip_loop_filter.Init(num_threads)); for (int i = 0; i < 10; ++i) { skip_loop_filter.SetSkipLoopFilter(i % 2, VPX_CODEC_OK); diff --git a/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh b/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh deleted file mode 100644 index 65031073f8..0000000000 --- a/media/libvpx/libvpx/test/vp9_spatial_svc_encoder.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -## -## Copyright (c) 2014 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## -## This file tests the libvpx vp9_spatial_svc_encoder example. To add new -## tests to to this file, do the following: -## 1. Write a shell function (this is your test). -## 2. Add the function to vp9_spatial_svc_tests (on a new line). -## -. $(dirname $0)/tools_common.sh - -# Environment check: $YUV_RAW_INPUT is required. -vp9_spatial_svc_encoder_verify_environment() { - if [ ! -e "${YUV_RAW_INPUT}" ]; then - echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." - return 1 - fi -} - -# Runs vp9_spatial_svc_encoder. $1 is the test name. -vp9_spatial_svc_encoder() { - local readonly \ - encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" - local readonly test_name="$1" - local readonly \ - output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local readonly frames_to_encode=10 - local readonly max_kf=9999 - - shift - - if [ ! -x "${encoder}" ]; then - elog "${encoder} does not exist or is not executable." - return 1 - fi - - eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} - - [ -e "${output_file}" ] || return 1 -} - -# Each test is run with layer count 1-$vp9_ssvc_test_layers. -vp9_ssvc_test_layers=5 - -vp9_spatial_svc() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly test_name="vp9_spatial_svc" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -sl ${layers} - done - fi -} - -readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i - DISABLED_vp9_spatial_svc_mode_altip - DISABLED_vp9_spatial_svc_mode_ip - DISABLED_vp9_spatial_svc_mode_gf - vp9_spatial_svc" - -if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then - run_tests \ - vp9_spatial_svc_encoder_verify_environment \ - "${vp9_spatial_svc_tests}" -fi diff --git a/media/libvpx/libvpx/test/vp9_subtract_test.cc b/media/libvpx/libvpx/test/vp9_subtract_test.cc index 19ed304315..19234c8c6b 100644 --- a/media/libvpx/libvpx/test/vp9_subtract_test.cc +++ b/media/libvpx/libvpx/test/vp9_subtract_test.cc @@ -7,98 +7,315 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/util.h" #include "vp9/common/vp9_blockd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" -typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, - ptrdiff_t diff_stride, const uint8_t *src_ptr, - ptrdiff_t src_stride, const uint8_t *pred_ptr, - ptrdiff_t pred_stride); +using SubtractFunc = void (*)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride); namespace vp9 { -class VP9SubtractBlockTest : public ::testing::TestWithParam { +class VP9SubtractBlockTest : public AbstractBench, + public ::testing::TestWithParam { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } + + protected: + void Run() override { + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); + } + + void SetupBlocks(BLOCK_SIZE bsize) { + block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize]; + block_height_ = 4 * num_4x4_blocks_high_lookup[bsize]; + diff_ = reinterpret_cast( + vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2)); + pred_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + src_ = reinterpret_cast( + vpx_memalign(16, block_width_ * block_height_ * 2)); + } + + int block_width_; + int block_height_; + int16_t *diff_; + uint8_t *pred_; + uint8_t *src_; }; using libvpx_test::ACMRandom; -TEST_P(VP9SubtractBlockTest, SimpleSubtract) { +TEST_P(VP9SubtractBlockTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - // FIXME(rbultje) split in its own file for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; bsize = static_cast(static_cast(bsize) + 1)) { - const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize]; - const int block_height = 4 * num_4x4_blocks_high_lookup[bsize]; - int16_t *diff = reinterpret_cast( - vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2)); - uint8_t *pred = reinterpret_cast( - vpx_memalign(16, block_width * block_height * 2)); - uint8_t *src = reinterpret_cast( - vpx_memalign(16, block_width * block_height * 2)); + SetupBlocks(bsize); - for (int n = 0; n < 100; n++) { - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width * 2; ++c) { - src[r * block_width * 2 + c] = rnd.Rand8(); - pred[r * block_width * 2 + c] = rnd.Rand8(); - } - } + RunNTimes(100000000 / (block_height_ * block_width_)); + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", block_height_, + block_width_); + char title[100]; + snprintf(title, sizeof(title), "%8s ", block_size); + PrintMedian(title); - GetParam()(block_height, block_width, diff, block_width, src, block_width, - pred, block_width); - - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width; ++c) { - EXPECT_EQ(diff[r * block_width + c], - (src[r * block_width + c] - pred[r * block_width + c])) - << "r = " << r << ", c = " << c << ", bs = " << bsize; - } - } - - GetParam()(block_height, block_width, diff, block_width * 2, src, - block_width * 2, pred, block_width * 2); - - for (int r = 0; r < block_height; ++r) { - for (int c = 0; c < block_width; ++c) { - EXPECT_EQ( - diff[r * block_width * 2 + c], - (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c])) - << "r = " << r << ", c = " << c << ", bs = " << bsize; - } - } - } - vpx_free(diff); - vpx_free(pred); - vpx_free(src); + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); } } -INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_c)); +TEST_P(VP9SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast(static_cast(bsize) + 1)) { + SetupBlocks(bsize); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_ * 2; ++c) { + src_[r * block_width_ * 2 + c] = rnd.Rand8(); + pred_[r * block_width_ * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (src_[r * block_width_ + c] - pred_[r * block_width_ + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); + } + } + + GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_, + block_width_ * 2, pred_, block_width_ * 2); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ * 2 + c], + (src_[r * block_width_ * 2 + c] - + pred_[r * block_width_ * 2 + c])) + << "r = " << r << ", c = " << c + << ", bs = " << static_cast(bsize); + } + } + } + vpx_free(diff_); + vpx_free(pred_); + vpx_free(src_); + } +} + +INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_c)); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_sse2)); +INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_sse2)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_avx2)); #endif #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_neon)); +INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_neon)); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, - ::testing::Values(vpx_subtract_block_msa)); +INSTANTIATE_TEST_SUITE_P(MSA, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_msa)); #endif +#if HAVE_MMI +INSTANTIATE_TEST_SUITE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + +#if HAVE_VSX +INSTANTIATE_TEST_SUITE_P(VSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_vsx)); +#endif + +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_lsx)); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDSubtractFunc = void (*)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd); + +// +using Params = std::tuple; + +class VPXHBDSubtractBlockTest : public ::testing::TestWithParam { + public: + void SetUp() override { + block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; + block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; + bit_depth_ = static_cast(GET_PARAM(1)); + func_ = GET_PARAM(2); + ref_func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + constexpr size_t kMaxWidth = 128; + constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(src_, nullptr); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t)))); + ASSERT_NE(pred_, nullptr); + diff_ = reinterpret_cast( + vpx_memalign(16, kMaxBlockSize * sizeof(int16_t))); + ASSERT_NE(diff_, nullptr); + } + + void TearDown() override { + vpx_free(CONVERT_TO_SHORTPTR(src_)); + vpx_free(CONVERT_TO_SHORTPTR(pred_)); + vpx_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + vpx_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + HBDSubtractFunc ref_func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void VPXHBDSubtractBlockTest::CheckResult() { + constexpr int kTestNum = 100; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + for (int i = 0; i < kTestNum; ++i) { + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void VPXHBDSubtractBlockTest::RunForSpeed() { + constexpr int kTestNum = 200000; + constexpr int kMaxWidth = 128; + constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth; + const int mask = (1 << bit_depth_) - 1; + + if (ref_func_ == func_) GTEST_SKIP(); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer ref_timer; + vpx_usec_timer_start(&ref_timer); + for (int i = 0; i < kTestNum; ++i) { + ref_func_(block_height_, block_width_, diff_, block_width_, src_, + block_width_, pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&ref_timer); + const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer); + + for (int j = 0; j < kMaxBlockSize; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kTestNum; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } + vpx_usec_timer_mark(&timer); + const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer); + + printf( + "[%dx%d]: " + "ref_time=%6" PRId64 " \t simd_time=%6" PRId64 + " \t " + "gain=%f \n", + block_width_, block_height_, ref_elapsed_time, elapsed_time, + static_cast(ref_elapsed_time) / + static_cast(elapsed_time)); +} + +TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); } + +const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64 }; + +INSTANTIATE_TEST_SUITE_P( + C, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_c), + ::testing::Values(&vpx_highbd_subtract_block_c))); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, VPXHBDSubtractBlockTest, + ::testing::Combine(::testing::ValuesIn(kValidBlockSize), + ::testing::Values(12), + ::testing::Values(&vpx_highbd_subtract_block_avx2), + ::testing::Values(&vpx_highbd_subtract_block_c))); +#endif // HAVE_AVX2 + +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace vp9 diff --git a/media/libvpx/libvpx/test/vp9_thread_test.cc b/media/libvpx/libvpx/test/vp9_thread_test.cc index 3e3fd25acb..387bb4050b 100644 --- a/media/libvpx/libvpx/test/vp9_thread_test.cc +++ b/media/libvpx/libvpx/test/vp9_thread_test.cc @@ -10,7 +10,7 @@ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "test/codec_factory.h" #include "test/decode_test_driver.h" @@ -26,10 +26,10 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam { protected: - virtual ~VPxWorkerThreadTest() {} - virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); } + ~VPxWorkerThreadTest() override = default; + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } - virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); } + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } void Run(VPxWorker *worker) { const bool synchronous = GetParam(); @@ -128,18 +128,18 @@ TEST_P(VPxWorkerThreadTest, EndWithoutSync) { } TEST(VPxWorkerThreadTest, TestInterfaceAPI) { - EXPECT_EQ(0, vpx_set_worker_interface(NULL)); - EXPECT_TRUE(vpx_get_worker_interface() != NULL); + EXPECT_EQ(0, vpx_set_worker_interface(nullptr)); + EXPECT_NE(vpx_get_worker_interface(), nullptr); for (int i = 0; i < 6; ++i) { VPxWorkerInterface winterface = *vpx_get_worker_interface(); switch (i) { default: - case 0: winterface.init = NULL; break; - case 1: winterface.reset = NULL; break; - case 2: winterface.sync = NULL; break; - case 3: winterface.launch = NULL; break; - case 4: winterface.execute = NULL; break; - case 5: winterface.end = NULL; break; + case 0: winterface.init = nullptr; break; + case 1: winterface.reset = nullptr; break; + case 2: winterface.sync = nullptr; break; + case 3: winterface.launch = nullptr; break; + case 4: winterface.execute = nullptr; break; + case 5: winterface.end = nullptr; break; } EXPECT_EQ(0, vpx_set_worker_interface(&winterface)); } @@ -147,13 +147,7 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests - #if CONFIG_WEBM_IO -struct FileList { - const char *name; - const char *expected_md5; -}; - // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames. string DecodeFile(const string &filename, int num_threads) { libvpx_test::WebMVideoSource video(filename); @@ -173,7 +167,7 @@ string DecodeFile(const string &filename, int num_threads) { } libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); - const vpx_image_t *img = NULL; + const vpx_image_t *img = nullptr; // Get decompressed data while ((img = dec_iter.Next())) { @@ -183,20 +177,11 @@ string DecodeFile(const string &filename, int num_threads) { return string(md5.Get()); } -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { - SCOPED_TRACE(iter->name); - for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = " - << t; - } - } -} - // Trivial serialized thread worker interface implementation. // Note any worker that requires synchronization between other workers will // hang. namespace impl { +namespace { void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); } int Reset(VPxWorker *const /*worker*/) { return 1; } @@ -209,16 +194,13 @@ void Execute(VPxWorker *const worker) { void Launch(VPxWorker *const worker) { Execute(worker); } void End(VPxWorker *const /*worker*/) {} +} // namespace } // namespace impl TEST(VPxWorkerThreadTest, TestSerialInterface) { static const VPxWorkerInterface serial_interface = { impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End }; - // TODO(jzern): Avoid using a file that will use the row-based thread - // loopfilter, with the simple serialized implementation it will hang. This is - // due to its expectation that rows will be run in parallel as they wait on - // progress in the row above before proceeding. static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc"; static const char filename[] = "vp90-2-03-size-226x226.webm"; VPxWorkerInterface default_interface = *vpx_get_worker_interface(); @@ -231,90 +213,83 @@ TEST(VPxWorkerThreadTest, TestSerialInterface) { EXPECT_EQ(expected_md5, DecodeFile(filename, 2)); } -TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) { - // no tiles or frame parallel; this exercises loop filter threading. - EXPECT_EQ("b35a1b707b28e82be025d960aba039bc", - DecodeFile("vp90-2-03-size-226x226.webm", 2)); +struct FileParam { + const char *name; + const char *expected_md5; + friend std::ostream &operator<<(std::ostream &os, const FileParam ¶m) { + return os << "file name: " << param.name + << " digest: " << param.expected_md5; + } +}; + +class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam { +}; + +TEST_P(VP9DecodeMultiThreadedTest, Decode) { + for (int t = 1; t <= 8; ++t) { + EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t)) + << "threads = " << t; + } } -TEST(VP9DecodeMultiThreadedTest, FrameParallel) { - static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm", - "68ede6abd66bae0a2edf2eb9232241b6" }, - { "vp90-2-08-tile_1x4_frame_parallel.webm", - "368ebc6ebf3a5e478d85b2c3149b2848" }, - { "vp90-2-08-tile_1x8_frame_parallel.webm", - "17e439da2388aff3a0f69cb22579c6c1" }, - { NULL, NULL } }; +const FileParam kNoTilesNonFrameParallelFiles[] = { + { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" } +}; - DecodeFiles(files); -} +const FileParam kFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2_frame_parallel.webm", + "68ede6abd66bae0a2edf2eb9232241b6" }, + { "vp90-2-08-tile_1x4_frame_parallel.webm", + "368ebc6ebf3a5e478d85b2c3149b2848" }, + { "vp90-2-08-tile_1x8_frame_parallel.webm", + "17e439da2388aff3a0f69cb22579c6c1" }, +}; -TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) { - static const FileList files[] = { - { "vp90-2-14-resize-fp-tiles-1-16.webm", - "0cd5e632c326297e975f38949c31ea94" }, - { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", - "5c78a96a42e7f4a4f6b2edcdb791e44c" }, - { "vp90-2-14-resize-fp-tiles-1-2.webm", - "e030450ae85c3277be2a418769df98e2" }, - { "vp90-2-14-resize-fp-tiles-1-4.webm", - "312eed4e2b64eb7a4e7f18916606a430" }, - { "vp90-2-14-resize-fp-tiles-16-1.webm", - "1755c16d8af16a9cb3fe7338d90abe52" }, - { "vp90-2-14-resize-fp-tiles-16-2.webm", - "500300592d3fcb6f12fab25e48aaf4df" }, - { "vp90-2-14-resize-fp-tiles-16-4.webm", - "47c48379fa6331215d91c67648e1af6e" }, - { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", - "eecf17290739bc708506fa4827665989" }, - { "vp90-2-14-resize-fp-tiles-16-8.webm", - "29b6bb54e4c26b5ca85d5de5fed94e76" }, - { "vp90-2-14-resize-fp-tiles-1-8.webm", - "1b6f175e08cd82cf84bb800ac6d1caa3" }, - { "vp90-2-14-resize-fp-tiles-2-16.webm", - "ca3b03e4197995d8d5444ede7a6c0804" }, - { "vp90-2-14-resize-fp-tiles-2-1.webm", - "99aec065369d70bbb78ccdff65afed3f" }, - { "vp90-2-14-resize-fp-tiles-2-4.webm", - "22d0ebdb49b87d2920a85aea32e1afd5" }, - { "vp90-2-14-resize-fp-tiles-2-8.webm", - "c2115cf051c62e0f7db1d4a783831541" }, - { "vp90-2-14-resize-fp-tiles-4-16.webm", - "c690d7e1719b31367564cac0af0939cb" }, - { "vp90-2-14-resize-fp-tiles-4-1.webm", - "a926020b2cc3e15ad4cc271853a0ff26" }, - { "vp90-2-14-resize-fp-tiles-4-2.webm", - "42699063d9e581f1993d0cf890c2be78" }, - { "vp90-2-14-resize-fp-tiles-4-8.webm", - "7f76d96036382f45121e3d5aa6f8ec52" }, - { "vp90-2-14-resize-fp-tiles-8-16.webm", - "76a43fcdd7e658542913ea43216ec55d" }, - { "vp90-2-14-resize-fp-tiles-8-1.webm", - "8e3fbe89486ca60a59299dea9da91378" }, - { "vp90-2-14-resize-fp-tiles-8-2.webm", - "ae96f21f21b6370cc0125621b441fc52" }, - { "vp90-2-14-resize-fp-tiles-8-4.webm", - "3eb4f24f10640d42218f7fd7b9fd30d4" }, - { NULL, NULL } - }; +const FileParam kFrameParallelResizeFiles[] = { + { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" }, + { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm", + "5c78a96a42e7f4a4f6b2edcdb791e44c" }, + { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" }, + { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" }, + { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" }, + { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" }, + { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" }, + { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm", + "eecf17290739bc708506fa4827665989" }, + { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" }, + { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" }, + { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" }, + { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" }, + { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" }, + { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" }, + { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" }, + { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" }, + { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" }, + { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" }, + { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" }, + { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" }, + { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" }, + { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" }, +}; - DecodeFiles(files); -} +const FileParam kNonFrameParallelFiles[] = { + { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, + { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, + { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, + { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, + { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, +}; -TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) { - static const FileList files[] = { - { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" }, - { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" }, - { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" }, - { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" }, - { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" }, - { NULL, NULL } - }; - - DecodeFiles(files); -} +INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNoTilesNonFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelFiles)); +INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kFrameParallelResizeFiles)); +INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest, + ::testing::ValuesIn(kNonFrameParallelFiles)); #endif // CONFIG_WEBM_IO -INSTANTIATE_TEST_CASE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); +INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool()); } // namespace diff --git a/media/libvpx/libvpx/test/vpx_image_test.cc b/media/libvpx/libvpx/test/vpx_image_test.cc new file mode 100644 index 0000000000..7c32035ed4 --- /dev/null +++ b/media/libvpx/libvpx/test/vpx_image_test.cc @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx/vpx_image.h" +#include "gtest/gtest.h" + +TEST(VpxImageTest, VpxImgWrapInvalidAlign) { + const int kWidth = 128; + const int kHeight = 128; + unsigned char buf[kWidth * kHeight * 3]; + + vpx_image_t img; + // Set img_data and img_data_owner to junk values. vpx_img_wrap() should + // not read these values on failure. + unsigned char empty[] = ""; + img.img_data = empty; + img.img_data_owner = 1; + + vpx_img_fmt_t format = VPX_IMG_FMT_I444; + // 'align' must be a power of 2 but is not. This causes the vpx_img_wrap() + // call to fail. The test verifies we do not read the junk values in 'img'. + unsigned int align = 31; + EXPECT_EQ(vpx_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr); +} + +TEST(VpxImageTest, VpxImgSetRectOverflow) { + const int kWidth = 128; + const int kHeight = 128; + unsigned char buf[kWidth * kHeight * 3]; + + vpx_image_t img; + vpx_img_fmt_t format = VPX_IMG_FMT_I444; + unsigned int align = 32; + EXPECT_EQ(vpx_img_wrap(&img, format, kWidth, kHeight, align, buf), &img); + + EXPECT_EQ(vpx_img_set_rect(&img, 0, 0, kWidth, kHeight), 0); + // This would result in overflow because -1 is cast to UINT_MAX. + EXPECT_NE(vpx_img_set_rect(&img, static_cast(-1), + static_cast(-1), kWidth, kHeight), + 0); +} + +TEST(VpxImageTest, VpxImgAllocNone) { + const int kWidth = 128; + const int kHeight = 128; + + vpx_image_t img; + vpx_img_fmt_t format = VPX_IMG_FMT_NONE; + unsigned int align = 32; + ASSERT_EQ(vpx_img_alloc(&img, format, kWidth, kHeight, align), nullptr); +} + +TEST(VpxImageTest, VpxImgAllocNv12) { + const int kWidth = 128; + const int kHeight = 128; + + vpx_image_t img; + vpx_img_fmt_t format = VPX_IMG_FMT_NV12; + unsigned int align = 32; + EXPECT_EQ(vpx_img_alloc(&img, format, kWidth, kHeight, align), &img); + EXPECT_EQ(img.stride[VPX_PLANE_U], img.stride[VPX_PLANE_Y]); + EXPECT_EQ(img.stride[VPX_PLANE_V], img.stride[VPX_PLANE_U]); + EXPECT_EQ(img.planes[VPX_PLANE_V], img.planes[VPX_PLANE_U] + 1); + vpx_img_free(&img); +} + +TEST(VpxImageTest, VpxImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + vpx_image_t *image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The aligned width (UINT_MAX + 1) would overflow unsigned int. + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + vpx_img_free(image); + } + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + vpx_img_free(image); + } + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + vpx_img_free(image); + } + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + vpx_img_free(image); + } + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[VPX_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + vpx_img_free(image); + } + + image = vpx_img_alloc(nullptr, VPX_IMG_FMT_I42016, 285245883, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[VPX_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + vpx_img_free(image); + } +} diff --git a/media/libvpx/libvpx/test/vpx_scale_test.cc b/media/libvpx/libvpx/test/vpx_scale_test.cc index 81773fe5b6..a5307ace93 100644 --- a/media/libvpx/libvpx/test/vpx_scale_test.cc +++ b/media/libvpx/libvpx/test/vpx_scale_test.cc @@ -8,181 +8,51 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_scale_rtcd.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/vpx_scale_test.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" #include "vpx_scale/yv12config.h" +namespace libvpx_test { namespace { -typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); -typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, - YV12_BUFFER_CONFIG *dst_ybf); +#if VPX_ARCH_ARM || (VPX_ARCH_MIPS && !HAVE_MIPS64) || VPX_ARCH_X86 +// Avoid OOM failures on 32-bit platforms. +const int kNumSizesToTest = 7; +#else +const int kNumSizesToTest = 8; +#endif +const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 3840, 16383 }; -class VpxScaleBase { - public: - virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } - - void ResetImage(int width, int height) { - width_ = width; - height_ = height; - memset(&img_, 0, sizeof(img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, - VP8BORDERINPIXELS)); - memset(img_.buffer_alloc, kBufFiller, img_.frame_size); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - - memset(&ref_img_, 0, sizeof(ref_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - - memset(&cpy_img_, 0, sizeof(cpy_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); - ReferenceCopyFrame(); - } - - void DeallocImage() { - vp8_yv12_de_alloc_frame_buffer(&img_); - vp8_yv12_de_alloc_frame_buffer(&ref_img_); - vp8_yv12_de_alloc_frame_buffer(&cpy_img_); - } - - protected: - static const int kBufFiller = 123; - static const int kBufMax = kBufFiller - 1; - - static void FillPlane(uint8_t *buf, int width, int height, int stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - buf[x + (y * stride)] = (x + (width * y)) % kBufMax; - } - } - } - - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, - int width, int height, int stride, int padding) { - // Copy the outermost visible pixel to a distance of at least 'padding.' - // The buffers are allocated such that there may be excess space outside the - // padding. As long as the minimum amount of padding is achieved it is not - // necessary to fill this space as well. - uint8_t *left = buf - padding; - uint8_t *right = buf + crop_width; - const int right_extend = padding + (width - crop_width); - const int bottom_extend = padding + (height - crop_height); - - // Fill the border pixels from the nearest image pixel. - for (int y = 0; y < crop_height; ++y) { - memset(left, left[padding], padding); - memset(right, right[-1], right_extend); - left += stride; - right += stride; - } - - left = buf - padding; - uint8_t *top = left - (stride * padding); - // The buffer does not always extend as far as the stride. - // Equivalent to padding + width + padding. - const int extend_width = padding + crop_width + right_extend; - - // The first row was already extended to the left and right. Copy it up. - for (int y = 0; y < padding; ++y) { - memcpy(top, left, extend_width); - top += stride; - } - - uint8_t *bottom = left + (crop_height * stride); - for (int y = 0; y < bottom_extend; ++y) { - memcpy(bottom, left + (crop_height - 1) * stride, extend_width); - bottom += stride; - } - } - - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, - ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, ref_img_.border); - ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - } - - void ReferenceCopyFrame() { - // Copy img_ to ref_img_ and extend frame borders. This will be used for - // verifying extend_fn_ as well as copy_frame_fn_. - EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - for (int y = 0; y < img_.y_crop_height; ++y) { - for (int x = 0; x < img_.y_crop_width; ++x) { - ref_img_.y_buffer[x + y * ref_img_.y_stride] = - img_.y_buffer[x + y * img_.y_stride]; - } - } - - for (int y = 0; y < img_.uv_crop_height; ++y) { - for (int x = 0; x < img_.uv_crop_width; ++x) { - ref_img_.u_buffer[x + y * ref_img_.uv_stride] = - img_.u_buffer[x + y * img_.uv_stride]; - ref_img_.v_buffer[x + y * ref_img_.uv_stride] = - img_.v_buffer[x + y * img_.uv_stride]; - } - } - - ReferenceExtendBorder(); - } - - void CompareImages(const YV12_BUFFER_CONFIG actual) { - EXPECT_EQ(ref_img_.frame_size, actual.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, - ref_img_.frame_size)); - } - - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; - YV12_BUFFER_CONFIG cpy_img_; - int width_; - int height_; -}; +using ExtendFrameBorderFunc = void (*)(YV12_BUFFER_CONFIG *ybf); +using CopyFrameFunc = void (*)(const YV12_BUFFER_CONFIG *src_ybf, + YV12_BUFFER_CONFIG *dst_ybf); class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~ExtendBorderTest() {} + ~ExtendBorderTest() override = default; protected: - virtual void SetUp() { extend_fn_ = GetParam(); } + void SetUp() override { extend_fn_ = GetParam(); } void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); ExtendBorder(); - ReferenceExtendBorder(); CompareImages(img_); - DeallocImage(); + DeallocImages(); } } } @@ -192,36 +62,29 @@ class ExtendBorderTest TEST_P(ExtendBorderTest, ExtendBorder) { ASSERT_NO_FATAL_FAILURE(RunTest()); } -INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest, - ::testing::Values(vp8_yv12_extend_frame_borders_c)); +INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, + ::testing::Values(vp8_yv12_extend_frame_borders_c)); class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam { public: - virtual ~CopyFrameTest() {} + ~CopyFrameTest() override = default; protected: - virtual void SetUp() { copy_frame_fn_ = GetParam(); } + void SetUp() override { copy_frame_fn_ = GetParam(); } void CopyFrame() { - ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ResetImage(kSizesToTest[w], kSizesToTest[h]); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); ReferenceCopyFrame(); CopyFrame(); - CompareImages(cpy_img_); - DeallocImage(); + CompareImages(dst_img_); + DeallocImages(); } } } @@ -231,6 +94,8 @@ class CopyFrameTest : public VpxScaleBase, TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } -INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, - ::testing::Values(vp8_yv12_copy_frame_c)); +INSTANTIATE_TEST_SUITE_P(C, CopyFrameTest, + ::testing::Values(vp8_yv12_copy_frame_c)); + } // namespace +} // namespace libvpx_test diff --git a/media/libvpx/libvpx/test/vpx_scale_test.h b/media/libvpx/libvpx/test/vpx_scale_test.h new file mode 100644 index 0000000000..89d62fcd7f --- /dev/null +++ b/media/libvpx/libvpx/test/vpx_scale_test.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_VPX_SCALE_TEST_H_ +#define VPX_TEST_VPX_SCALE_TEST_H_ + +#include "gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)) + << "for width: " << width << " height: " << height; + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. + uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // VPX_TEST_VPX_SCALE_TEST_H_ diff --git a/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh index 6b6d15e7f8..69c734daf8 100644 --- a/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/media/libvpx/libvpx/test/vpx_temporal_svc_encoder.sh @@ -38,9 +38,11 @@ vpx_tsvc_encoder() { local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}" local timebase_num="1" local timebase_den="1000" + local timebase_den_y4m="30" local speed="6" local frame_drop_thresh="30" local max_threads="4" + local error_resilient="1" shift 2 @@ -51,11 +53,25 @@ vpx_tsvc_encoder() { # TODO(tomfinegan): Verify file output for all thread runs. for threads in $(seq $max_threads); do - eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \ - "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \ - "${threads}" "$@" \ - ${devnull} + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} || return 1 + # Test for y4m input. + eval "${VPX_TEST_PREFIX}" "${encoder}" "${Y4M_720P_INPUT}" \ + "${output_file}" "${codec}" "${Y4M_720P_INPUT_WIDTH}" \ + "${Y4M_720P_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den_y4m}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} || return 1 + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} || return 1 + fi done } @@ -76,193 +92,217 @@ files_exist() { vpx_tsvc_encoder_vp8_mode_0() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 0 200 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_0" + vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp8_mode_1() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 1 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_1" + vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_2() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 2 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_2" + vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_3() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 3 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_3" + vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_4() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 4 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_4" + vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_5() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 5 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_5" + vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_6() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 6 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_6" + vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_7() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_7" + vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp8_mode_8() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 8 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_8" + vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp8_mode_9() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 9 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_9" + vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_10() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 10 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_10" + vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp8_mode_11() { if [ "$(vp8_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp8 "${FUNCNAME}" 11 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp8_mode_11" + vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_0() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 0 200 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_0" + vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream - files_exist "${FUNCNAME}" 1 || return 1 + files_exist "${output_basename}" 1 || return 1 fi } vpx_tsvc_encoder_vp9_mode_1() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 1 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_1" + vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_2() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 2 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_2" + vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_3() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 3 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_3" + vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_4() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 4 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_4" + vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_5() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 5 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_5" + vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_6() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 6 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_6" + vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_7() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 7 200 400 600 800 1000 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_7" + vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams - files_exist "${FUNCNAME}" 5 || return 1 + files_exist "${output_basename}" 5 || return 1 fi } vpx_tsvc_encoder_vp9_mode_8() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 8 200 400 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_8" + vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams - files_exist "${FUNCNAME}" 2 || return 1 + files_exist "${output_basename}" 2 || return 1 fi } vpx_tsvc_encoder_vp9_mode_9() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 9 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_9" + vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_10() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 10 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_10" + vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } vpx_tsvc_encoder_vp9_mode_11() { if [ "$(vp9_encode_available)" = "yes" ]; then - vpx_tsvc_encoder vp9 "${FUNCNAME}" 11 200 400 600 || return 1 + local output_basename="vpx_tsvc_encoder_vp9_mode_11" + vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams - files_exist "${FUNCNAME}" 3 || return 1 + files_exist "${output_basename}" 3 || return 1 fi } diff --git a/media/libvpx/libvpx/test/vpxdec.sh b/media/libvpx/libvpx/test/vpxdec.sh index de51c8004e..199feae5f3 100644 --- a/media/libvpx/libvpx/test/vpxdec.sh +++ b/media/libvpx/libvpx/test/vpxdec.sh @@ -18,7 +18,8 @@ vpxdec_verify_environment() { if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \ [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \ - [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then + [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] || \ + [ ! -e "${VP9_RAW_FILE}" ]; then elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi @@ -33,8 +34,8 @@ vpxdec_verify_environment() { # input file path and shifted away. All remaining parameters are passed through # to vpxdec. vpxdec_pipe() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull} } @@ -43,8 +44,8 @@ vpxdec_pipe() { # the directory containing vpxdec. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxdec. vpxdec() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} } @@ -85,7 +86,7 @@ vpxdec_vp9_webm_frame_parallel() { [ "$(webm_io_available)" = "yes" ]; then for threads in 2 3 4 5 6 7 8; do vpxdec "${VP9_FPM_WEBM_FILE}" --summary --noblit --threads=$threads \ - --frame-parallel + --frame-parallel || return 1 done fi } @@ -95,9 +96,9 @@ vpxdec_vp9_webm_less_than_50_frames() { # frames in actual webm_read_frame calls. if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly expected=10 - local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ + local decoder="$(vpx_tool_path vpxdec)" + local expected=10 + local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \ | awk '/^[0-9]+ decoded frames/ { print $1 }') if [ "$num_frames" -ne "$expected" ]; then @@ -107,10 +108,28 @@ vpxdec_vp9_webm_less_than_50_frames() { fi } +# Ensures VP9_RAW_FILE correctly produces 1 frame instead of causing a hang. +vpxdec_vp9_raw_file() { + # Ensure a raw file properly reports eof and doesn't cause a hang. + if [ "$(vpxdec_can_decode_vp9)" = "yes" ]; then + local decoder="$(vpx_tool_path vpxdec)" + local expected=1 + [ -x /usr/bin/timeout ] && local TIMEOUT="/usr/bin/timeout 30s" + local num_frames=$(${TIMEOUT} ${VPX_TEST_PREFIX} "${decoder}" \ + "${VP9_RAW_FILE}" --summary --noblit 2>&1 \ + | awk '/^[0-9]+ decoded frames/ { print $1 }') + if [ -z "$num_frames" ] || [ "$num_frames" -ne "$expected" ]; then + elog "Output frames ($num_frames) != expected ($expected)" + return 1 + fi + fi +} + vpxdec_tests="vpxdec_vp8_ivf vpxdec_vp8_ivf_pipe_input vpxdec_vp9_webm vpxdec_vp9_webm_frame_parallel - vpxdec_vp9_webm_less_than_50_frames" + vpxdec_vp9_webm_less_than_50_frames + vpxdec_vp9_raw_file" run_tests vpxdec_verify_environment "${vpxdec_tests}" diff --git a/media/libvpx/libvpx/test/vpxenc.sh b/media/libvpx/libvpx/test/vpxenc.sh index e8994992ae..172349a2b3 100644 --- a/media/libvpx/libvpx/test/vpxenc.sh +++ b/media/libvpx/libvpx/test/vpxenc.sh @@ -67,7 +67,7 @@ y4m_input_720p() { # Echo default vpxenc real time encoding params. $1 is the codec, which defaults # to vp8 if unspecified. vpxenc_rt_params() { - local readonly codec="${1:-vp8}" + local codec="${1:-vp8}" echo "--codec=${codec} --buf-initial-sz=500 --buf-optimal-sz=600 @@ -90,13 +90,22 @@ vpxenc_rt_params() { --undershoot-pct=50" } +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. +vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through # to vpxenc. vpxenc_pipe() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \ --test-decode=fatal \ @@ -107,8 +116,8 @@ vpxenc_pipe() { # the directory containing vpxenc. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxenc. vpxenc() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \ --test-decode=fatal \ @@ -117,12 +126,12 @@ vpxenc() { vpxenc_vp8_ivf() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ --ivf \ - --output="${output}" + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -134,11 +143,11 @@ vpxenc_vp8_ivf() { vpxenc_vp8_webm() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ - --output="${output}" + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -150,10 +159,11 @@ vpxenc_vp8_webm() { vpxenc_vp8_webm_rt() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp8) \ - --output="${output}" + --output="${output}" || return 1 + if [ ! -e "${output}" ]; then elog "Output file does not exist." return 1 @@ -164,12 +174,12 @@ vpxenc_vp8_webm_rt() { vpxenc_vp8_webm_2pass() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ --output="${output}" \ - --passes=2 + --passes=2 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -181,16 +191,16 @@ vpxenc_vp8_webm_2pass() { vpxenc_vp8_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ --auto-alt-ref=1 \ - --passes=2 + --passes=2 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -201,12 +211,12 @@ vpxenc_vp8_webm_lag10_frames20() { vpxenc_vp8_ivf_piped_input() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" vpxenc_pipe $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ --ivf \ - --output="${output}" + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -217,12 +227,14 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --ivf \ - --output="${output}" + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -234,11 +246,13 @@ vpxenc_vp9_ivf() { vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ - --output="${output}" + "${passes}" \ + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -250,10 +264,10 @@ vpxenc_vp9_webm() { vpxenc_vp9_webm_rt() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp9) \ - --output="${output}" + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -265,11 +279,11 @@ vpxenc_vp9_webm_rt() { vpxenc_vp9_webm_rt_multithread_tiled() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -277,27 +291,26 @@ vpxenc_vp9_webm_rt_multithread_tiled() { $(vpxenc_rt_params vp9) \ --threads=${threads} \ --tile-columns=${tile_cols} \ - --output="${output}" + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." - return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -306,28 +319,27 @@ vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { --threads=${threads} \ --tile-columns=${tile_cols} \ --frame-parallel=1 \ - --output="${output}" + --output="${output}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." - return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_2pass() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --output="${output}" \ - --passes=2 + --passes=2 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -338,13 +350,15 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ - --lossless=1 + "${passes}" \ + --lossless=1 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -355,14 +369,16 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --min-q=0 \ - --max-q=0 + --max-q=0 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -374,16 +390,17 @@ vpxenc_vp9_ivf_minq0_maxq0() { vpxenc_vp9_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --passes=2 \ - --auto-alt-ref=1 + "${passes}" \ + --auto-alt-ref=1 || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -396,11 +413,13 @@ vpxenc_vp9_webm_lag10_frames20() { vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ - --output="${output}" + "${passes}" \ + --output="${output}" || return 1 if [ ! -e "${output}" ]; then elog "Output file does not exist." @@ -409,21 +428,62 @@ vpxenc_vp9_webm_non_square_par() { fi } +vpxenc_vp9_webm_sharpness() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local sharpnesses="0 1 2 3 4 5 6 7" + local output="${VPX_TEST_OUTPUT_DIR}/vpxenc_vp9_webm_sharpness.ivf" + local last_size=0 + local this_size=0 + + for sharpness in ${sharpnesses}; do + + vpxenc $(yuv_input_hantro_collage) \ + --sharpness="${sharpness}" \ + --codec=vp9 \ + --limit=1 \ + --cpu-used=2 \ + --end-usage=q \ + --cq-level=40 \ + --output="${output}" \ + "${passes}" || return 1 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + + this_size=$(stat -c '%s' "${output}") + if [ "${this_size}" -lt "${last_size}" ]; then + elog "Higher sharpness value yielded lower file size." + echo "${this_size}" " < " "${last_size}" + return 1 + fi + last_size="${this_size}" + + done + fi +} + vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt - vpxenc_vp8_webm_2pass - vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm vpxenc_vp9_webm_rt vpxenc_vp9_webm_rt_multithread_tiled vpxenc_vp9_webm_rt_multithread_tiled_frameparallel - vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 - vpxenc_vp9_webm_non_square_par" + vpxenc_vp9_webm_non_square_par + vpxenc_vp9_webm_sharpness" + +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/media/libvpx/libvpx/test/webm_video_source.h b/media/libvpx/libvpx/test/webm_video_source.h index 53713618ee..6ab50c849f 100644 --- a/media/libvpx/libvpx/test/webm_video_source.h +++ b/media/libvpx/libvpx/test/webm_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_WEBM_VIDEO_SOURCE_H_ -#define TEST_WEBM_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_ +#define VPX_TEST_WEBM_VIDEO_SOURCE_H_ #include #include #include @@ -26,35 +26,35 @@ class WebMVideoSource : public CompressedVideoSource { public: explicit WebMVideoSource(const std::string &file_name) : file_name_(file_name), vpx_ctx_(new VpxInputContext()), - webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0), + webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~WebMVideoSource() { - if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file); + ~WebMVideoSource() override { + if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; } - virtual void Init() {} + void Init() override {} - virtual void Begin() { + void Begin() override { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_NE(vpx_ctx_->file, nullptr) + << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } void FillFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; if (status == 1) { @@ -63,7 +63,7 @@ class WebMVideoSource : public CompressedVideoSource { } void SeekToNextKeyFrame() { - ASSERT_TRUE(vpx_ctx_->file != NULL); + ASSERT_NE(vpx_ctx_->file, nullptr); do { const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_); ASSERT_GE(status, 0) << "webm_read_frame failed"; @@ -74,9 +74,11 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; } - virtual size_t frame_size() const { return buf_sz_; } - virtual unsigned int frame_number() const { return frame_; } + const uint8_t *cxdata() const override { + return end_of_file_ ? nullptr : buf_; + } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; @@ -90,4 +92,4 @@ class WebMVideoSource : public CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_WEBM_VIDEO_SOURCE_H_ +#endif // VPX_TEST_WEBM_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/y4m_test.cc b/media/libvpx/libvpx/test/y4m_test.cc index ced717a7c1..3865880b60 100644 --- a/media/libvpx/libvpx/test/y4m_test.cc +++ b/media/libvpx/libvpx/test/y4m_test.cc @@ -10,7 +10,7 @@ #include -#include "third_party/googletest/src/include/gtest/gtest.h" +#include "gtest/gtest.h" #include "./vpx_config.h" #include "./y4menc.h" @@ -40,18 +40,18 @@ const Y4mTestParam kY4mTestVectors[] = { "284a47a47133b12884ec3a14e959a0b6" }, { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, "90517ff33843d85de712fd4fe60dbed0" }, - { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, - "63f21f9f717d8b8631bd2288ee87137b" }, - { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, - "48ab51fb540aed07f7ff5af130c9b605" }, - { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, - "067bfd75aa85ff9bae91fa3e0edd1e3e" }, - { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, - "9e6d8f6508c6e55625f6b697bc461cef" }, - { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, - "b239c6b301c0b835485be349ca83a7e3" }, - { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, - "5a6481a550821dab6d0192f5c63845e9" }, + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, + "2f56ab9809269f074df7e3daf1ce0be6" }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, + "1b5c73d2e8e8c4e02dc4889ecac41c83" }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, + "ec4ab5be53195c5b838d1d19e1bc2674" }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, + "3370856c8ddebbd1f9bb2e66f97677f4" }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, + "4eab364318dd8201acbb182e43bd4966" }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, + "f189dfbbd92119fc8e5f211a550166be" }, }; static void write_image_file(const vpx_image_t *img, FILE *file) { @@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} - virtual ~Y4mVideoSourceTest() { CloseSource(); } + ~Y4mVideoSourceTest() override { CloseSource(); } virtual void Init(const std::string &file_name, int limit) { file_name_ = file_name; @@ -90,7 +90,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, // Checks y4m header information void HeaderChecks(unsigned int bit_depth, vpx_img_fmt_t fmt) { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); ASSERT_EQ(y4m_.pic_w, (int)kWidth); ASSERT_EQ(y4m_.pic_h, (int)kHeight); ASSERT_EQ(img()->d_w, kWidth); @@ -116,7 +116,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam, // Checks MD5 of the raw frame data void Md5Check(const string &expected_md5) { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); libvpx_test::MD5 md5; for (unsigned int i = start_; i < limit_; i++) { md5.Add(img()); @@ -133,16 +133,16 @@ TEST_P(Y4mVideoSourceTest, SourceTest) { Md5Check(t.md5raw); } -INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest, - ::testing::ValuesIn(kY4mTestVectors)); +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest, + ::testing::ValuesIn(kY4mTestVectors)); class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: - Y4mVideoWriteTest() : tmpfile_(NULL) {} + Y4mVideoWriteTest() : tmpfile_(nullptr) {} - virtual ~Y4mVideoWriteTest() { + ~Y4mVideoWriteTest() override { delete tmpfile_; - input_file_ = NULL; + input_file_ = nullptr; } void ReplaceInputFile(FILE *input_file) { @@ -155,11 +155,11 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { // Writes out a y4m file and then reads it back void WriteY4mAndReadBack() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); char buf[Y4M_BUFFER_SIZE] = { 0 }; const struct VpxRational framerate = { y4m_.fps_n, y4m_.fps_d }; tmpfile_ = new libvpx_test::TempOutFile; - ASSERT_TRUE(tmpfile_->file() != NULL); + ASSERT_NE(tmpfile_->file(), nullptr); y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate, y4m_.vpx_fmt, y4m_.bit_depth); fputs(buf, tmpfile_->file()); @@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { ReplaceInputFile(tmpfile_->file()); } - virtual void Init(const std::string &file_name, int limit) { + void Init(const std::string &file_name, int limit) override { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } @@ -186,6 +186,59 @@ TEST_P(Y4mVideoWriteTest, WriteTest) { Md5Check(t.md5raw); } -INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest, - ::testing::ValuesIn(kY4mTestVectors)); +INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest, + ::testing::ValuesIn(kY4mTestVectors)); + +static const char kY4MRegularHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, RegularHeader) { + libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); + fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(0, fseek(f.file(), 0, 0)); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + +// Testing that headers over 100 characters can be parsed. +static const char kY4MLongHeader[] = + "YUV4MPEG2 W4 H4 F30:1 Ip A0:0 C420jpeg XYSCSS=420JPEG " + "XCOLORRANGE=LIMITED XSOME_UNKNOWN_METADATA XOTHER_UNKNOWN_METADATA\n" + "FRAME\n" + "012345678912345601230123"; + +TEST(Y4MHeaderTest, LongHeader) { + libvpx_test::TempOutFile f; + ASSERT_NE(f.file(), nullptr); + fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file()); + fflush(f.file()); + EXPECT_EQ(fseek(f.file(), 0, 0), 0); + + y4m_input y4m; + EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr, + /*num_skip=*/0, /*only_420=*/0), + 0); + EXPECT_EQ(y4m.pic_w, 4); + EXPECT_EQ(y4m.pic_h, 4); + EXPECT_EQ(y4m.fps_n, 30); + EXPECT_EQ(y4m.fps_d, 1); + EXPECT_EQ(y4m.interlace, 'p'); + EXPECT_EQ(strcmp("420jpeg", y4m.chroma_type), 0); + y4m_input_close(&y4m); +} + } // namespace diff --git a/media/libvpx/libvpx/test/y4m_video_source.h b/media/libvpx/libvpx/test/y4m_video_source.h index 2682ddde3d..e43e37d9e4 100644 --- a/media/libvpx/libvpx/test/y4m_video_source.h +++ b/media/libvpx/libvpx/test/y4m_video_source.h @@ -7,9 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_Y4M_VIDEO_SOURCE_H_ -#define TEST_Y4M_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_Y4M_VIDEO_SOURCE_H_ +#define VPX_TEST_Y4M_VIDEO_SOURCE_H_ #include +#include #include #include "test/video_source.h" @@ -22,11 +23,11 @@ namespace libvpx_test { class Y4mVideoSource : public VideoSource { public: Y4mVideoSource(const std::string &file_name, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()), + : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()), start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} - virtual ~Y4mVideoSource() { + ~Y4mVideoSource() override { vpx_img_free(img_.get()); CloseSource(); } @@ -34,13 +35,13 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; } virtual void ReadSourceToStart() { - ASSERT_TRUE(input_file_ != NULL); - ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0)); + ASSERT_NE(input_file_, nullptr); + ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0)); framerate_numerator_ = y4m_.fps_n; framerate_denominator_ = y4m_.fps_d; frame_ = 0; @@ -50,36 +51,36 @@ class Y4mVideoSource : public VideoSource { FillFrame(); } - virtual void Begin() { + void Begin() override { OpenSource(); ReadSourceToStart(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { - return (frame_ < limit_) ? img_.get() : NULL; + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); } @@ -100,15 +101,15 @@ class Y4mVideoSource : public VideoSource { void CloseSource() { y4m_input_close(&y4m_); y4m_ = y4m_input(); - if (input_file_ != NULL) { + if (input_file_ != nullptr) { fclose(input_file_); - input_file_ = NULL; + input_file_ = nullptr; } } std::string file_name_; FILE *input_file_; - testing::internal::scoped_ptr img_; + std::unique_ptr img_; unsigned int start_; unsigned int limit_; unsigned int frame_; @@ -119,4 +120,4 @@ class Y4mVideoSource : public VideoSource { } // namespace libvpx_test -#endif // TEST_Y4M_VIDEO_SOURCE_H_ +#endif // VPX_TEST_Y4M_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc new file mode 100644 index 0000000000..e3a9ae9d27 --- /dev/null +++ b/media/libvpx/libvpx/test/yuv_temporal_filter_test.cc @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/register_state_check.h" +#include "vpx_config.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +using ::libvpx_test::ACMRandom; +using ::libvpx_test::Buffer; + +using YUVTemporalFilterFunc = void (*)( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +struct TemporalFilterWithBd { + TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth) + : temporal_filter(func), bd(bitdepth) {} + + YUVTemporalFilterFunc temporal_filter; + int bd; +}; + +std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) { + return os << "Bitdepth: " << tf.bd; +} + +int GetFilterWeight(unsigned int row, unsigned int col, + unsigned int block_height, unsigned int block_width, + const int *const blk_fw, int use_32x32) { + if (use_32x32) { + return blk_fw[0]; + } + + return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)]; +} + +template +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod = sum_dist * 3 / index; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <> +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <> +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int64_t index_mult[14] = { 0U, 0U, 0U, 0U, + 3221225472U, 2576980378U, 2147483648U, 1840700270U, + 1610612736U, 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = static_cast((sum_dist * index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template +void ApplyReferenceFilter( + const Buffer &y_src, const Buffer &y_pre, + const Buffer &u_src, const Buffer &v_src, + const Buffer &u_pre, const Buffer &v_pre, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *const blk_fw, int use_32x32, + Buffer *y_accumulator, Buffer *y_counter, + Buffer *u_accumulator, Buffer *u_counter, + Buffer *v_accumulator, Buffer *v_counter) { + const PixelType *y_src_ptr = y_src.TopLeftPixel(); + const PixelType *y_pre_ptr = y_pre.TopLeftPixel(); + const PixelType *u_src_ptr = u_src.TopLeftPixel(); + const PixelType *u_pre_ptr = u_pre.TopLeftPixel(); + const PixelType *v_src_ptr = v_src.TopLeftPixel(); + const PixelType *v_pre_ptr = v_pre.TopLeftPixel(); + + const int uv_block_width = block_width >> ss_x, + uv_block_height = block_height >> ss_y; + const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride(); + const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride(); + const int y_diff_stride = block_width, uv_diff_stride = uv_block_width; + + Buffer y_dif = Buffer(block_width, block_height, 0); + Buffer u_dif = Buffer(uv_block_width, uv_block_height, 0); + Buffer v_dif = Buffer(uv_block_width, uv_block_height, 0); + + ASSERT_TRUE(y_dif.Init()); + ASSERT_TRUE(u_dif.Init()); + ASSERT_TRUE(v_dif.Init()); + y_dif.Set(0); + u_dif.Set(0); + v_dif.Set(0); + + int *y_diff_ptr = y_dif.TopLeftPixel(); + int *u_diff_ptr = u_dif.TopLeftPixel(); + int *v_diff_ptr = v_dif.TopLeftPixel(); + + uint32_t *y_accum = y_accumulator->TopLeftPixel(); + uint32_t *u_accum = u_accumulator->TopLeftPixel(); + uint32_t *v_accum = v_accumulator->TopLeftPixel(); + uint16_t *y_count = y_counter->TopLeftPixel(); + uint16_t *u_count = u_counter->TopLeftPixel(); + uint16_t *v_count = v_counter->TopLeftPixel(); + + const int y_accum_stride = y_accumulator->stride(); + const int u_accum_stride = u_accumulator->stride(); + const int v_accum_stride = v_accumulator->stride(); + const int y_count_stride = y_counter->stride(); + const int u_count_stride = u_counter->stride(); + const int v_count_stride = v_counter->stride(); + + const int rounding = (1 << strength) >> 1; + + // Get the square diffs + for (int row = 0; row < static_cast(block_height); row++) { + for (int col = 0; col < static_cast(block_width); col++) { + const int diff = y_src_ptr[row * y_src_stride + col] - + y_pre_ptr[row * y_pre_stride + col]; + y_diff_ptr[row * y_diff_stride + col] = diff * diff; + } + } + + for (int row = 0; row < uv_block_height; row++) { + for (int col = 0; col < uv_block_width; col++) { + const int u_diff = u_src_ptr[row * uv_src_stride + col] - + u_pre_ptr[row * uv_pre_stride + col]; + const int v_diff = v_src_ptr[row * uv_src_stride + col] - + v_pre_ptr[row * uv_pre_stride + col]; + u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (int row = 0; row < static_cast(block_height); row++) { + for (int col = 0; col < static_cast(block_width); col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = GetFilterWeight(row, col, block_height, + block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre_ptr[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < static_cast(block_height) && + sub_col >= 0 && sub_col < static_cast(block_width)) { + y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = GetModIndex(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; + } + } +} + +class YUVTemporalFilterTest + : public ::testing::TestWithParam { + public: + void SetUp() override { + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); + } + + protected: + template + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + YUVTemporalFilterFunc filter_func_; + ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; +}; + +template +void YUVTemporalFilterTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer y_src = Buffer(width, height, 0); + Buffer y_pre = Buffer(width, height, 0); + Buffer y_count_ref = Buffer(width, height, 0); + Buffer y_accum_ref = Buffer(width, height, 0); + Buffer y_count_tst = Buffer(width, height, 0); + Buffer y_accum_tst = Buffer(width, height, 0); + + Buffer u_src = Buffer(uv_width, uv_height, 0); + Buffer u_pre = Buffer(uv_width, uv_height, 0); + Buffer u_count_ref = Buffer(uv_width, uv_height, 0); + Buffer u_accum_ref = Buffer(uv_width, uv_height, 0); + Buffer u_count_tst = Buffer(uv_width, uv_height, 0); + Buffer u_accum_tst = Buffer(uv_width, uv_height, 0); + + Buffer v_src = Buffer(uv_width, uv_height, 0); + Buffer v_pre = Buffer(uv_width, uv_height, 0); + Buffer v_count_ref = Buffer(uv_width, uv_height, 0); + Buffer v_accum_ref = Buffer(uv_width, uv_height, 0); + Buffer v_count_tst = Buffer(uv_width, uv_height, 0); + Buffer v_accum_tst = Buffer(uv_width, uv_height, 0); + + ASSERT_TRUE(y_src.Init()); + ASSERT_TRUE(y_pre.Init()); + ASSERT_TRUE(y_count_ref.Init()); + ASSERT_TRUE(y_accum_ref.Init()); + ASSERT_TRUE(y_count_tst.Init()); + ASSERT_TRUE(y_accum_tst.Init()); + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count_ref.Init()); + ASSERT_TRUE(u_accum_ref.Init()); + ASSERT_TRUE(u_count_tst.Init()); + ASSERT_TRUE(u_accum_tst.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count_ref.Init()); + ASSERT_TRUE(v_accum_ref.Init()); + ASSERT_TRUE(v_count_tst.Init()); + ASSERT_TRUE(v_accum_tst.Init()); + + y_accum_ref.Set(0); + y_accum_tst.Set(0); + y_count_ref.Set(0); + y_count_tst.Set(0); + u_accum_ref.Set(0); + u_accum_tst.Set(0); + u_count_ref.Set(0); + u_count_tst.Set(0); + v_accum_ref.Set(0); + v_accum_tst.Set(0); + v_count_ref.Set(0); + v_count_tst.Set(0); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + y_src.Set(max_val); + y_pre.Set(0); + u_src.Set(max_val); + u_pre.Set(0); + v_src.Set(max_val); + v_pre.Set(0); + } else { + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + } + + ApplyReferenceFilter( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref, + &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref); + + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast(u_src.TopLeftPixel()), + reinterpret_cast(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast(u_pre.TopLeftPixel()), + reinterpret_cast(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(), + u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(), + v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel())); + + EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref)); + EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref)); + EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref)); + EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref)); + EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref)); + EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + y_accum_tst.PrintDifference(y_accum_ref); + y_count_tst.PrintDifference(y_count_ref); + u_accum_tst.PrintDifference(u_accum_ref); + u_count_tst.PrintDifference(u_count_ref); + v_accum_tst.PrintDifference(v_accum_ref); + v_count_tst.PrintDifference(v_count_ref); + + return; + } + } +} + +template +void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + + Buffer y_src = Buffer(width, height, 0); + Buffer y_pre = Buffer(width, height, 0); + Buffer y_count = Buffer(width, height, 0); + Buffer y_accum = Buffer(width, height, 0); + + Buffer u_src = Buffer(uv_width, uv_height, 0); + Buffer u_pre = Buffer(uv_width, uv_height, 0); + Buffer u_count = Buffer(uv_width, uv_height, 0); + Buffer u_accum = Buffer(uv_width, uv_height, 0); + + Buffer v_src = Buffer(uv_width, uv_height, 0); + Buffer v_pre = Buffer(uv_width, uv_height, 0); + Buffer v_count = Buffer(uv_width, uv_height, 0); + Buffer v_accum = Buffer(uv_width, uv_height, 0); + + ASSERT_TRUE(y_src.Init()); + ASSERT_TRUE(y_pre.Init()); + ASSERT_TRUE(y_count.Init()); + ASSERT_TRUE(y_accum.Init()); + + ASSERT_TRUE(u_src.Init()); + ASSERT_TRUE(u_pre.Init()); + ASSERT_TRUE(u_count.Init()); + ASSERT_TRUE(u_accum.Init()); + + ASSERT_TRUE(v_src.Init()); + ASSERT_TRUE(v_pre.Init()); + ASSERT_TRUE(v_count.Init()); + ASSERT_TRUE(v_accum.Init()); + + y_accum.Set(0); + y_count.Set(0); + + u_accum.Set(0); + u_count.Set(0); + + v_accum.Set(0); + v_count.Set(0); + + y_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + y_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + u_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_src.Set(&rnd_, 0, 7 << (bd_ - 8)); + v_pre.Set(&rnd_, 0, 7 << (bd_ - 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ASM_REGISTER_STATE_CHECK(filter_func_( + reinterpret_cast(y_src.TopLeftPixel()), y_src.stride(), + reinterpret_cast(y_pre.TopLeftPixel()), y_pre.stride(), + reinterpret_cast(u_src.TopLeftPixel()), + reinterpret_cast(v_src.TopLeftPixel()), u_src.stride(), + reinterpret_cast(u_pre.TopLeftPixel()), + reinterpret_cast(v_pre.TopLeftPixel()), u_pre.stride(), + width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32, + y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(), + u_count.TopLeftPixel(), v_accum.TopLeftPixel(), + v_count.TopLeftPixel())); + } +} + +TEST_P(YUVTemporalFilterTest, Use32x32) { + const int width = 32, height = 32; + const int use_32x32 = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, Use16x16) { + const int width = 32, height = 32; + const int use_32x32 = 0; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, SaturationTest) { + const int width = 32, height = 32; + const int use_32x32 = 1; + const int filter_weight = 1; + saturate_test_ = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } +} + +TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { + const int width = 32, height = 32; + num_repeats_ = 1000; + + for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { + const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < num_filter_weights; + filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + + if (use_highbd_) { + RunTestFilterWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer)); + + printf( + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, + elapsed_time); + } + } + } + } + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define WRAP_HIGHBD_FUNC(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, \ + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, \ + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, \ + int uv_pre_stride, unsigned int block_width, unsigned int block_height, \ + int ss_x, int ss_y, int strength, const int *const blk_fw, \ + int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, \ + uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, \ + uint16_t *v_count) { \ + func(reinterpret_cast(y_src), y_src_stride, \ + reinterpret_cast(y_pre), y_pre_stride, \ + reinterpret_cast(u_src), \ + reinterpret_cast(v_src), uv_src_stride, \ + reinterpret_cast(u_pre), \ + reinterpret_cast(v_pre), uv_pre_stride, \ + block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32, \ + y_accumulator, y_count, u_accumulator, u_count, v_accumulator, \ + v_count); \ + } + +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12) + +INSTANTIATE_TEST_SUITE_P( + C, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12))); +#if HAVE_SSE4_1 +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12) + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, + 12))); +#endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON +#else +INSTANTIATE_TEST_SUITE_P( + C, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_sse4_1, 8))); +#endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON +#endif // CONFIG_VP9_HIGHBITDEPTH + +} // namespace diff --git a/media/libvpx/libvpx/test/yuv_video_source.h b/media/libvpx/libvpx/test/yuv_video_source.h index 71ad2ab9a6..bb5eec5bb8 100644 --- a/media/libvpx/libvpx/test/yuv_video_source.h +++ b/media/libvpx/libvpx/test/yuv_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_YUV_VIDEO_SOURCE_H_ -#define TEST_YUV_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_YUV_VIDEO_SOURCE_H_ +#define VPX_TEST_YUV_VIDEO_SOURCE_H_ #include #include @@ -27,24 +27,24 @@ class YUVVideoSource : public VideoSource { YUVVideoSource(const std::string &file_name, vpx_img_fmt format, unsigned int width, unsigned int height, int rate_numerator, int rate_denominator, unsigned int start, int limit) - : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start), - limit_(limit), frame_(0), width_(0), height_(0), + : file_name_(file_name), input_file_(nullptr), img_(nullptr), + start_(start), limit_(limit), frame_(0), width_(0), height_(0), format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { // This initializes format_, raw_size_, width_, height_ and allocates img. SetSize(width, height, format); } - virtual ~YUVVideoSource() { + ~YUVVideoSource() override { vpx_img_free(img_); if (input_file_) fclose(input_file_); } - virtual void Begin() { + void Begin() override { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_NE(input_file_, nullptr) + << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast(raw_size_) * start_, SEEK_SET); } @@ -53,37 +53,40 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + vpx_image_t *img() const override { + return (frame_ < limit_) ? img_ : nullptr; + } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void SetSize(unsigned int width, unsigned int height, vpx_img_fmt format) { if (width != width_ || height != height_ || format != format_) { vpx_img_free(img_); - img_ = vpx_img_alloc(NULL, format, width, height, 1); - ASSERT_TRUE(img_ != NULL); + img_ = vpx_img_alloc(nullptr, format, width, height, 1); + ASSERT_NE(img_, nullptr); width_ = width; height_ = height; format_ = format; switch (format) { + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break; case VPX_IMG_FMT_I422: raw_size_ = width * height * 2; break; case VPX_IMG_FMT_I440: raw_size_ = width * height * 2; break; @@ -98,7 +101,7 @@ class YUVVideoSource : public VideoSource { } virtual void FillFrame() { - ASSERT_TRUE(input_file_ != NULL); + ASSERT_NE(input_file_, nullptr); // Read a frame from input_file. if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) { limit_ = frame_; @@ -122,4 +125,4 @@ class YUVVideoSource : public VideoSource { } // namespace libvpx_test -#endif // TEST_YUV_VIDEO_SOURCE_H_ +#endif // VPX_TEST_YUV_VIDEO_SOURCE_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/README.libvpx b/media/libvpx/libvpx/third_party/googletest/README.libvpx index 0e3b8b9377..5f6b01b0ec 100644 --- a/media/libvpx/libvpx/third_party/googletest/README.libvpx +++ b/media/libvpx/libvpx/third_party/googletest/README.libvpx @@ -1,7 +1,7 @@ -URL: http://code.google.com/p/googletest/ -Version: 1.7.0 +URL: https://github.com/google/googletest.git +Version: release-1.12.1 License: BSD -License File: COPYING +License File: LICENSE Description: Google's framework for writing C++ tests on a variety of platforms @@ -12,10 +12,18 @@ failures, various options for running the tests, and XML test report generation. Local Modifications: -- Removed unused declarations of kPathSeparatorString to have warning - free build. -- Added GTEST_ATTRIBUTE_UNUSED_ to test registering dummies in TEST_P - and INSTANTIATE_TEST_CASE_P to remove warnings about unused variables - under GCC 5. -- Only define g_in_fast_death_test_child for non-Windows builds; quiets an - unused variable warning. +- Remove everything but: + .clang-format + CONTRIBUTORS + googletest/ + include + README.md + src + LICENSE +- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/ +- In googletest/include/gtest/internal/custom/gtest-port.h, define + GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix + the mingw32 g++ compilation errors caused by the lack of std::mutex + and std::condition_variable in the and + headers if mingw32 is configured with the win32 threads option. See + https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32 diff --git a/media/libvpx/libvpx/third_party/googletest/src/.clang-format b/media/libvpx/libvpx/third_party/googletest/src/.clang-format new file mode 100644 index 0000000000..5b9bfe6d22 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/.clang-format @@ -0,0 +1,4 @@ +# Run manually to reformat a file: +# clang-format -i --style=file +Language: Cpp +BasedOnStyle: Google diff --git a/media/libvpx/libvpx/third_party/googletest/src/CHANGES b/media/libvpx/libvpx/third_party/googletest/src/CHANGES deleted file mode 100644 index 0552132421..0000000000 --- a/media/libvpx/libvpx/third_party/googletest/src/CHANGES +++ /dev/null @@ -1,157 +0,0 @@ -Changes for 1.7.0: - -* New feature: death tests are supported on OpenBSD and in iOS - simulator now. -* New feature: Google Test now implements a protocol to allow - a test runner to detect that a test program has exited - prematurely and report it as a failure (before it would be - falsely reported as a success if the exit code is 0). -* New feature: Test::RecordProperty() can now be used outside of the - lifespan of a test method, in which case it will be attributed to - the current test case or the test program in the XML report. -* New feature (potentially breaking): --gtest_list_tests now prints - the type parameters and value parameters for each test. -* Improvement: char pointers and char arrays are now escaped properly - in failure messages. -* Improvement: failure summary in XML reports now includes file and - line information. -* Improvement: the XML element now has a timestamp attribute. -* Improvement: When --gtest_filter is specified, XML report now doesn't - contain information about tests that are filtered out. -* Fixed the bug where long --gtest_filter flag values are truncated in - death tests. -* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a - function instead of a macro in order to work better with Clang. -* Compatibility fixes with C++ 11 and various platforms. -* Bug/warning fixes. - -Changes for 1.6.0: - -* New feature: ADD_FAILURE_AT() for reporting a test failure at the - given source location -- useful for writing testing utilities. -* New feature: the universal value printer is moved from Google Mock - to Google Test. -* New feature: type parameters and value parameters are reported in - the XML report now. -* A gtest_disable_pthreads CMake option. -* Colored output works in GNU Screen sessions now. -* Parameters of value-parameterized tests are now printed in the - textual output. -* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are - now correctly reported. -* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to - ostream. -* More complete handling of exceptions. -* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter - name is already used by another library. -* --gtest_catch_exceptions is now true by default, allowing a test - program to continue after an exception is thrown. -* Value-parameterized test fixtures can now derive from Test and - WithParamInterface separately, easing conversion of legacy tests. -* Death test messages are clearly marked to make them more - distinguishable from other messages. -* Compatibility fixes for Android, Google Native Client, MinGW, HP UX, - PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear), - IBM XL C++ (Visual Age C++), and C++0x. -* Bug fixes and implementation clean-ups. -* Potentially incompatible changes: disables the harmful 'make install' - command in autotools. - -Changes for 1.5.0: - - * New feature: assertions can be safely called in multiple threads - where the pthreads library is available. - * New feature: predicates used inside EXPECT_TRUE() and friends - can now generate custom failure messages. - * New feature: Google Test can now be compiled as a DLL. - * New feature: fused source files are included. - * New feature: prints help when encountering unrecognized Google Test flags. - * Experimental feature: CMake build script (requires CMake 2.6.4+). - * Experimental feature: the Pump script for meta programming. - * double values streamed to an assertion are printed with enough precision - to differentiate any two different values. - * Google Test now works on Solaris and AIX. - * Build and test script improvements. - * Bug fixes and implementation clean-ups. - - Potentially breaking changes: - - * Stopped supporting VC++ 7.1 with exceptions disabled. - * Dropped support for 'make install'. - -Changes for 1.4.0: - - * New feature: the event listener API - * New feature: test shuffling - * New feature: the XML report format is closer to junitreport and can - be parsed by Hudson now. - * New feature: when a test runs under Visual Studio, its failures are - integrated in the IDE. - * New feature: /MD(d) versions of VC++ projects. - * New feature: elapsed time for the tests is printed by default. - * New feature: comes with a TR1 tuple implementation such that Boost - is no longer needed for Combine(). - * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends. - * New feature: the Xcode project can now produce static gtest - libraries in addition to a framework. - * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile, - Symbian, gcc, and C++Builder. - * Bug fixes and implementation clean-ups. - -Changes for 1.3.0: - - * New feature: death tests on Windows, Cygwin, and Mac. - * New feature: ability to use Google Test assertions in other testing - frameworks. - * New feature: ability to run disabled test via - --gtest_also_run_disabled_tests. - * New feature: the --help flag for printing the usage. - * New feature: access to Google Test flag values in user code. - * New feature: a script that packs Google Test into one .h and one - .cc file for easy deployment. - * New feature: support for distributing test functions to multiple - machines (requires support from the test runner). - * Bug fixes and implementation clean-ups. - -Changes for 1.2.1: - - * Compatibility fixes for Linux IA-64 and IBM z/OS. - * Added support for using Boost and other TR1 implementations. - * Changes to the build scripts to support upcoming release of Google C++ - Mocking Framework. - * Added Makefile to the distribution package. - * Improved build instructions in README. - -Changes for 1.2.0: - - * New feature: value-parameterized tests. - * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS) - macros. - * Changed the XML report format to match JUnit/Ant's. - * Added tests to the Xcode project. - * Added scons/SConscript for building with SCons. - * Added src/gtest-all.cc for building Google Test from a single file. - * Fixed compatibility with Solaris and z/OS. - * Enabled running Python tests on systems with python 2.3 installed, - e.g. Mac OS X 10.4. - * Bug fixes. - -Changes for 1.1.0: - - * New feature: type-parameterized tests. - * New feature: exception assertions. - * New feature: printing elapsed time of tests. - * Improved the robustness of death tests. - * Added an Xcode project and samples. - * Adjusted the output format on Windows to be understandable by Visual Studio. - * Minor bug fixes. - -Changes for 1.0.1: - - * Added project files for Visual Studio 7.1. - * Fixed issues with compiling on Mac OS X. - * Fixed issues with compiling on Cygwin. - -Changes for 1.0.0: - - * Initial Open Source release of Google Test diff --git a/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS index feae2fc044..77397a5b53 100644 --- a/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS +++ b/media/libvpx/libvpx/third_party/googletest/src/CONTRIBUTORS @@ -5,33 +5,61 @@ Ajay Joshi Balázs Dán +Benoit Sigoure Bharat Mediratta +Bogdan Piloca Chandler Carruth Chris Prince Chris Taylor Dan Egnor +Dave MacLachlan +David Anderson +Dean Sturtevant Eric Roman +Gene Volovich Hady Zalek +Hal Burch Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray Jói Sigurðsson Keir Mierle Keith Ray Kenton Varda +Kostya Serebryany +Krystian Kuzniarek +Lev Makhlis Manuel Klimek +Mario Tanev +Mark Paskin Markus Heule +Martijn Vels +Matthew Simmons Mika Raento +Mike Bland Miklós Fazekas +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba Pasi Valminen Patrick Hanna Patrick Riley +Paul Menage Peter Kaminski +Piotr Kaminski Preston Jackson Rainer Klaffenboeck Russ Cox Russ Rufer Sean Mcafee Sigurður Ásgeirsson +Sverre Sundsdal +Szymon Sobik +Takeshi Yoshino Tracy Bialik Vadim Berman Vlad Losev +Wolfgang Klier Zhanyong Wan diff --git a/media/libvpx/libvpx/third_party/googletest/src/README b/media/libvpx/libvpx/third_party/googletest/src/README deleted file mode 100644 index 26f35a8479..0000000000 --- a/media/libvpx/libvpx/third_party/googletest/src/README +++ /dev/null @@ -1,435 +0,0 @@ -Google C++ Testing Framework -============================ - -http://code.google.com/p/googletest/ - -Overview --------- - -Google's framework for writing C++ tests on a variety of platforms -(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the -xUnit architecture. Supports automatic test discovery, a rich set of -assertions, user-defined assertions, death tests, fatal and non-fatal -failures, various options for running the tests, and XML test report -generation. - -Please see the project page above for more information as well as the -mailing list for questions, discussions, and development. There is -also an IRC channel on OFTC (irc.oftc.net) #gtest available. Please -join us! - -Requirements for End Users --------------------------- - -Google Test is designed to have fairly minimal requirements to build -and use with your projects, but there are some. Currently, we support -Linux, Windows, Mac OS X, and Cygwin. We will also make our best -effort to support other platforms (e.g. Solaris, AIX, and z/OS). -However, since core members of the Google Test project have no access -to these platforms, Google Test may have outstanding issues there. If -you notice any problems on your platform, please notify -googletestframework@googlegroups.com. Patches for fixing them are -even more welcome! - -### Linux Requirements ### - -These are the base requirements to build and use Google Test from a source -package (as described below): - * GNU-compatible Make or gmake - * POSIX-standard shell - * POSIX(-2) Regular Expressions (regex.h) - * A C++98-standard-compliant compiler - -### Windows Requirements ### - - * Microsoft Visual C++ 7.1 or newer - -### Cygwin Requirements ### - - * Cygwin 1.5.25-14 or newer - -### Mac OS X Requirements ### - - * Mac OS X 10.4 Tiger or newer - * Developer Tools Installed - -Also, you'll need CMake 2.6.4 or higher if you want to build the -samples using the provided CMake script, regardless of the platform. - -Requirements for Contributors ------------------------------ - -We welcome patches. If you plan to contribute a patch, you need to -build Google Test and its own tests from an SVN checkout (described -below), which has further requirements: - - * Python version 2.3 or newer (for running some of the tests and - re-generating certain source files from templates) - * CMake 2.6.4 or newer - -Getting the Source ------------------- - -There are two primary ways of getting Google Test's source code: you -can download a stable source release in your preferred archive format, -or directly check out the source from our Subversion (SVN) repositary. -The SVN checkout requires a few extra steps and some extra software -packages on your system, but lets you track the latest development and -make patches much more easily, so we highly encourage it. - -### Source Package ### - -Google Test is released in versioned source packages which can be -downloaded from the download page [1]. Several different archive -formats are provided, but the only difference is the tools used to -manipulate them, and the size of the resulting file. Download -whichever you are most comfortable with. - - [1] http://code.google.com/p/googletest/downloads/list - -Once the package is downloaded, expand it using whichever tools you -prefer for that type. This will result in a new directory with the -name "gtest-X.Y.Z" which contains all of the source code. Here are -some examples on Linux: - - tar -xvzf gtest-X.Y.Z.tar.gz - tar -xvjf gtest-X.Y.Z.tar.bz2 - unzip gtest-X.Y.Z.zip - -### SVN Checkout ### - -To check out the main branch (also known as the "trunk") of Google -Test, run the following Subversion command: - - svn checkout http://googletest.googlecode.com/svn/trunk/ gtest-svn - -Setting up the Build --------------------- - -To build Google Test and your tests that use it, you need to tell your -build system where to find its headers and source files. The exact -way to do it depends on which build system you use, and is usually -straightforward. - -### Generic Build Instructions ### - -Suppose you put Google Test in directory ${GTEST_DIR}. To build it, -create a library build target (or a project as called by Visual Studio -and Xcode) to compile - - ${GTEST_DIR}/src/gtest-all.cc - -with ${GTEST_DIR}/include in the system header search path and ${GTEST_DIR} -in the normal header search path. Assuming a Linux-like system and gcc, -something like the following will do: - - g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ - -pthread -c ${GTEST_DIR}/src/gtest-all.cc - ar -rv libgtest.a gtest-all.o - -(We need -pthread as Google Test uses threads.) - -Next, you should compile your test source file with -${GTEST_DIR}/include in the system header search path, and link it -with gtest and any other necessary libraries: - - g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ - -o your_test - -As an example, the make/ directory contains a Makefile that you can -use to build Google Test on systems where GNU make is available -(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google -Test's own tests. Instead, it just builds the Google Test library and -a sample test. You can use it as a starting point for your own build -script. - -If the default settings are correct for your environment, the -following commands should succeed: - - cd ${GTEST_DIR}/make - make - ./sample1_unittest - -If you see errors, try to tweak the contents of make/Makefile to make -them go away. There are instructions in make/Makefile on how to do -it. - -### Using CMake ### - -Google Test comes with a CMake build script (CMakeLists.txt) that can -be used on a wide range of platforms ("C" stands for cross-platofrm.). -If you don't have CMake installed already, you can download it for -free from http://www.cmake.org/. - -CMake works by generating native makefiles or build projects that can -be used in the compiler environment of your choice. The typical -workflow starts with: - - mkdir mybuild # Create a directory to hold the build output. - cd mybuild - cmake ${GTEST_DIR} # Generate native build scripts. - -If you want to build Google Test's samples, you should replace the -last command with - - cmake -Dgtest_build_samples=ON ${GTEST_DIR} - -If you are on a *nix system, you should now see a Makefile in the -current directory. Just type 'make' to build gtest. - -If you use Windows and have Vistual Studio installed, a gtest.sln file -and several .vcproj files will be created. You can then build them -using Visual Studio. - -On Mac OS X with Xcode installed, a .xcodeproj file will be generated. - -### Legacy Build Scripts ### - -Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we -continue to provide them for convenience, they are not actively -maintained any more. We highly recommend that you follow the -instructions in the previous two sections to integrate Google Test -with your existing build system. - -If you still need to use the legacy build scripts, here's how: - -The msvc\ folder contains two solutions with Visual C++ projects. -Open the gtest.sln or gtest-md.sln file using Visual Studio, and you -are ready to build Google Test the same way you build any Visual -Studio project. Files that have names ending with -md use DLL -versions of Microsoft runtime libraries (the /MD or the /MDd compiler -option). Files without that suffix use static versions of the runtime -libraries (the /MT or the /MTd option). Please note that one must use -the same option to compile both gtest and the test code. If you use -Visual Studio 2005 or above, we recommend the -md version as /MD is -the default for new projects in these versions of Visual Studio. - -On Mac OS X, open the gtest.xcodeproj in the xcode/ folder using -Xcode. Build the "gtest" target. The universal binary framework will -end up in your selected build directory (selected in the Xcode -"Preferences..." -> "Building" pane and defaults to xcode/build). -Alternatively, at the command line, enter: - - xcodebuild - -This will build the "Release" configuration of gtest.framework in your -default build location. See the "xcodebuild" man page for more -information about building different configurations and building in -different locations. - -If you wish to use the Google Test Xcode project with Xcode 4.x and -above, you need to either: - * update the SDK configuration options in xcode/Config/General.xconfig. - Comment options SDKROOT, MACOS_DEPLOYMENT_TARGET, and GCC_VERSION. If - you choose this route you lose the ability to target earlier versions - of MacOS X. - * Install an SDK for an earlier version. This doesn't appear to be - supported by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). - -Tweaking Google Test --------------------- - -Google Test can be used in diverse environments. The default -configuration may not work (or may not work well) out of the box in -some environments. However, you can easily tweak Google Test by -defining control macros on the compiler command line. Generally, -these macros are named like GTEST_XYZ and you define them to either 1 -or 0 to enable or disable a certain feature. - -We list the most frequently used macros below. For a complete list, -see file include/gtest/internal/gtest-port.h. - -### Choosing a TR1 Tuple Library ### - -Some Google Test features require the C++ Technical Report 1 (TR1) -tuple library, which is not yet available with all compilers. The -good news is that Google Test implements a subset of TR1 tuple that's -enough for its own need, and will automatically use this when the -compiler doesn't provide TR1 tuple. - -Usually you don't need to care about which tuple library Google Test -uses. However, if your project already uses TR1 tuple, you need to -tell Google Test to use the same TR1 tuple library the rest of your -project uses, or the two tuple implementations will clash. To do -that, add - - -DGTEST_USE_OWN_TR1_TUPLE=0 - -to the compiler flags while compiling Google Test and your tests. If -you want to force Google Test to use its own tuple library, just add - - -DGTEST_USE_OWN_TR1_TUPLE=1 - -to the compiler flags instead. - -If you don't want Google Test to use tuple at all, add - - -DGTEST_HAS_TR1_TUPLE=0 - -and all features using tuple will be disabled. - -### Multi-threaded Tests ### - -Google Test is thread-safe where the pthread library is available. -After #include "gtest/gtest.h", you can check the GTEST_IS_THREADSAFE -macro to see whether this is the case (yes if the macro is #defined to -1, no if it's undefined.). - -If Google Test doesn't correctly detect whether pthread is available -in your environment, you can force it with - - -DGTEST_HAS_PTHREAD=1 - -or - - -DGTEST_HAS_PTHREAD=0 - -When Google Test uses pthread, you may need to add flags to your -compiler and/or linker to select the pthread library, or you'll get -link errors. If you use the CMake script or the deprecated Autotools -script, this is taken care of for you. If you use your own build -script, you'll need to read your compiler and linker's manual to -figure out what flags to add. - -### As a Shared Library (DLL) ### - -Google Test is compact, so most users can build and link it as a -static library for the simplicity. You can choose to use Google Test -as a shared library (known as a DLL on Windows) if you prefer. - -To compile *gtest* as a shared library, add - - -DGTEST_CREATE_SHARED_LIBRARY=1 - -to the compiler flags. You'll also need to tell the linker to produce -a shared library instead - consult your linker's manual for how to do -it. - -To compile your *tests* that use the gtest shared library, add - - -DGTEST_LINKED_AS_SHARED_LIBRARY=1 - -to the compiler flags. - -Note: while the above steps aren't technically necessary today when -using some compilers (e.g. GCC), they may become necessary in the -future, if we decide to improve the speed of loading the library (see -http://gcc.gnu.org/wiki/Visibility for details). Therefore you are -recommended to always add the above flags when using Google Test as a -shared library. Otherwise a future release of Google Test may break -your build script. - -### Avoiding Macro Name Clashes ### - -In C++, macros don't obey namespaces. Therefore two libraries that -both define a macro of the same name will clash if you #include both -definitions. In case a Google Test macro clashes with another -library, you can force Google Test to rename its macro to avoid the -conflict. - -Specifically, if both Google Test and some other code define macro -FOO, you can add - - -DGTEST_DONT_DEFINE_FOO=1 - -to the compiler flags to tell Google Test to change the macro's name -from FOO to GTEST_FOO. Currently FOO can be FAIL, SUCCEED, or TEST. -For example, with -DGTEST_DONT_DEFINE_TEST=1, you'll need to write - - GTEST_TEST(SomeTest, DoesThis) { ... } - -instead of - - TEST(SomeTest, DoesThis) { ... } - -in order to define a test. - -Upgrating from an Earlier Version ---------------------------------- - -We strive to keep Google Test releases backward compatible. -Sometimes, though, we have to make some breaking changes for the -users' long-term benefits. This section describes what you'll need to -do if you are upgrading from an earlier version of Google Test. - -### Upgrading from 1.3.0 or Earlier ### - -You may need to explicitly enable or disable Google Test's own TR1 -tuple library. See the instructions in section "Choosing a TR1 Tuple -Library". - -### Upgrading from 1.4.0 or Earlier ### - -The Autotools build script (configure + make) is no longer officially -supportted. You are encouraged to migrate to your own build system or -use CMake. If you still need to use Autotools, you can find -instructions in the README file from Google Test 1.4.0. - -On platforms where the pthread library is available, Google Test uses -it in order to be thread-safe. See the "Multi-threaded Tests" section -for what this means to your build script. - -If you use Microsoft Visual C++ 7.1 with exceptions disabled, Google -Test will no longer compile. This should affect very few people, as a -large portion of STL (including ) doesn't compile in this mode -anyway. We decided to stop supporting it in order to greatly simplify -Google Test's implementation. - -Developing Google Test ----------------------- - -This section discusses how to make your own changes to Google Test. - -### Testing Google Test Itself ### - -To make sure your changes work as intended and don't break existing -functionality, you'll want to compile and run Google Test's own tests. -For that you can use CMake: - - mkdir mybuild - cd mybuild - cmake -Dgtest_build_tests=ON ${GTEST_DIR} - -Make sure you have Python installed, as some of Google Test's tests -are written in Python. If the cmake command complains about not being -able to find Python ("Could NOT find PythonInterp (missing: -PYTHON_EXECUTABLE)"), try telling it explicitly where your Python -executable can be found: - - cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} - -Next, you can build Google Test and all of its own tests. On *nix, -this is usually done by 'make'. To run the tests, do - - make test - -All tests should pass. - -### Regenerating Source Files ### - -Some of Google Test's source files are generated from templates (not -in the C++ sense) using a script. A template file is named FOO.pump, -where FOO is the name of the file it will generate. For example, the -file include/gtest/internal/gtest-type-util.h.pump is used to generate -gtest-type-util.h in the same directory. - -Normally you don't need to worry about regenerating the source files, -unless you need to modify them. In that case, you should modify the -corresponding .pump files instead and run the pump.py Python script to -regenerate them. You can find pump.py in the scripts/ directory. -Read the Pump manual [2] for how to use it. - - [2] http://code.google.com/p/googletest/wiki/PumpManual - -### Contributing a Patch ### - -We welcome patches. Please read the Google Test developer's guide [3] -for how you can contribute. In particular, make sure you have signed -the Contributor License Agreement, or we won't be able to accept the -patch. - - [3] http://code.google.com/p/googletest/wiki/GoogleTestDevGuide - -Happy testing! diff --git a/media/libvpx/libvpx/third_party/googletest/src/README.md b/media/libvpx/libvpx/third_party/googletest/src/README.md new file mode 100644 index 0000000000..d26b309ed0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/README.md @@ -0,0 +1,217 @@ +### Generic Build Instructions + +#### Setup + +To build GoogleTest and your tests that use it, you need to tell your build +system where to find its headers and source files. The exact way to do it +depends on which build system you use, and is usually straightforward. + +### Build with CMake + +GoogleTest comes with a CMake build script +([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +that can be used on a wide range of platforms ("C" stands for cross-platform.). +If you don't have CMake installed already, you can download it for free from +. + +CMake works by generating native makefiles or build projects that can be used in +the compiler environment of your choice. You can either build GoogleTest as a +standalone project or it can be incorporated into an existing CMake build for +another project. + +#### Standalone CMake Project + +When building GoogleTest as a standalone project, the typical workflow starts +with + +``` +git clone https://github.com/google/googletest.git -b release-1.11.0 +cd googletest # Main directory of the cloned repository. +mkdir build # Create a directory to hold the build output. +cd build +cmake .. # Generate native build scripts for GoogleTest. +``` + +The above command also includes GoogleMock by default. And so, if you want to +build only GoogleTest, you should replace the last command with + +``` +cmake .. -DBUILD_GMOCK=OFF +``` + +If you are on a \*nix system, you should now see a Makefile in the current +directory. Just type `make` to build GoogleTest. And then you can simply install +GoogleTest if you are a system administrator. + +``` +make +sudo make install # Install in /usr/local/ by default +``` + +If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +#### Incorporating Into An Existing CMake Project + +If you want to use GoogleTest in a project which already uses CMake, the easiest +way is to get installed libraries and headers. + +* Import GoogleTest by using `find_package` (or `pkg_check_modules`). For + example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the + libraries as `GTest::gtest`, `GTest::gmock`. + +And a more robust and flexible approach is to build GoogleTest as part of that +project directly. This is done by making the GoogleTest source code available to +the main build and adding it using CMake's `add_subdirectory()` command. This +has the significant advantage that the same compiler and linker settings are +used between GoogleTest and the rest of your project, so issues associated with +using incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + approach doesn't have the limitations of the other methods. + +The last of the above methods is implemented with a small piece of CMake code +that downloads and pulls the GoogleTest code into the main build. + +Just add to your `CMakeLists.txt`: + +```cmake +include(FetchContent) +FetchContent_Declare( + googletest + # Specify the commit you depend on and update it regularly. + URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) + +# Now simply link against gtest or gtest_main as needed. Eg +add_executable(example example.cpp) +target_link_libraries(example gtest_main) +add_test(NAME example_test COMMAND example) +``` + +Note that this approach requires CMake 3.14 or later due to its use of the +`FetchContent_MakeAvailable()` command. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +GoogleTest links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +GoogleTest already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +#### C++ Standard Version + +An environment that supports C++11 is required in order to successfully build +GoogleTest. One way to ensure this is to specify the standard in the top-level +project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this +is not feasible, for example in a C project using GoogleTest for validation, +then it can be specified by adding it to the options for cmake via the +`DCMAKE_CXX_FLAGS` option. + +### Tweaking GoogleTest + +GoogleTest can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak GoogleTest by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/googletest/include/gtest/internal/gtest-port.h). + +### Multi-threaded Tests + +GoogleTest is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the +`GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is +`#defined` to 1, no if it's undefined.). + +If GoogleTest doesn't correctly detect whether pthread is available in your +environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When GoogleTest uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script, this is taken care of for you. If you use your own build script, +you'll need to read your compiler and linker's manual to figure out what flags +to add. + +### As a Shared Library (DLL) + +GoogleTest is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use GoogleTest as a shared library (known +as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. + +To compile your *tests* that use the gtest shared library, add + + -DGTEST_LINKED_AS_SHARED_LIBRARY=1 + +to the compiler flags. + +Note: while the above steps aren't technically necessary today when using some +compilers (e.g. GCC), they may become necessary in the future, if we decide to +improve the speed of loading the library (see + for details). Therefore you are recommended +to always add the above flags when using GoogleTest as a shared library. +Otherwise a future release of GoogleTest may break your build script. + +### Avoiding Macro Name Clashes + +In C++, macros don't obey namespaces. Therefore two libraries that both define a +macro of the same name will clash if you `#include` both definitions. In case a +GoogleTest macro clashes with another library, you can force GoogleTest to +rename its macro to avoid the conflict. + +Specifically, if both GoogleTest and some other code define macro FOO, you can +add + + -DGTEST_DONT_DEFINE_FOO=1 + +to the compiler flags to tell GoogleTest to change the macro's name from `FOO` +to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`, +`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`, +`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For +example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write + + GTEST_TEST(SomeTest, DoesThis) { ... } + +instead of + + TEST(SomeTest, DoesThis) { ... } + +in order to define a test. diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h new file mode 100644 index 0000000000..addbb59c64 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h @@ -0,0 +1,237 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements the AssertionResult type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ + +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. + // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T& success, + typename std::enable_if< + !std::is_convertible::value>::type* + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult& operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult& other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h new file mode 100644 index 0000000000..84e5a5bbd3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -0,0 +1,345 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +namespace testing { + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. +// +// Examples: +// +// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); +// for (int i = 0; i < 5; i++) { +// EXPECT_DEATH(server.ProcessRequest(i), +// "Invalid request .* in ProcessRequest()") +// << "Failed to die on request " << i; +// } +// +// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); +// +// bool KilledBySIGHUP(int exit_code) { +// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; +// } +// +// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); +// +// The final parameter to each of these macros is a matcher applied to any data +// the sub-process wrote to stderr. For compatibility with existing tests, a +// bare string is interpreted as a regular expression matcher. +// +// On the regular expressions used in death tests: +// +// On POSIX-compliant systems (*nix), we use the library, +// which uses the POSIX extended regex syntax. +// +// On other platforms (e.g. Windows or Mac), we only support a simple regex +// syntax implemented as part of Google Test. This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. +// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. +// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// + +// Asserts that a given `statement` causes the program to exit, with an +// integer exit status that satisfies `predicate`, and emitting error output +// that matches `matcher`. +#define ASSERT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_) + +// Like `ASSERT_EXIT`, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_EXIT(statement, predicate, matcher) \ + GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given `statement` causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches `matcher`. +#define ASSERT_DEATH(statement, matcher) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) + +// Like `ASSERT_DEATH`, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_DEATH(statement, matcher) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + ExitedWithCode(const ExitedWithCode&) = default; + void operator=(const ExitedWithCode& other) = delete; + bool operator()(int exit_status) const; + + private: + const int exit_code_; +}; + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + + private: + const int signum_; +}; +#endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. +// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +#ifdef NDEBUG + +#define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#else + +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) + +#endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters +// on systems that support death tests. This allows one to write such a macro on +// a system that does not support death tests and be sure that it will compile +// on a death-test supporting system. It is exposed publicly so that systems +// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST +// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and +// ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter if and only if EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) +#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) +#endif + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h new file mode 100644 index 0000000000..bffa00c533 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h @@ -0,0 +1,956 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ + +#include +#include +#include +#include +#include + +#include "gtest/gtest-printers.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +// MSVC warning C5046 is new as of VS2017 version 15.8. +#if defined(_MSC_VER) && _MSC_VER >= 1915 +#define GTEST_MAYBE_5046_ 5046 +#else +#define GTEST_MAYBE_5046_ +#endif + +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by + clients of class B */ + /* Symbol involving type with internal linkage not defined */) + +namespace testing { + +// To implement a matcher Foo for type T, define: +// 1. a class FooMatcherMatcher that implements the matcher interface: +// using is_gtest_matcher = void; +// bool MatchAndExplain(const T&, std::ostream*); +// (MatchResultListener* can also be used instead of std::ostream*) +// void DescribeTo(std::ostream*); +// void DescribeNegationTo(std::ostream*); +// +// 2. a factory function that creates a Matcher object from a +// FooMatcherMatcher. + +class MatchResultListener { + public: + // Creates a listener object with the given underlying ostream. The + // listener does not own the ostream, and does not dereference it + // in the constructor or destructor. + explicit MatchResultListener(::std::ostream* os) : stream_(os) {} + virtual ~MatchResultListener() = 0; // Makes this class abstract. + + // Streams x to the underlying ostream; does nothing if the ostream + // is NULL. + template + MatchResultListener& operator<<(const T& x) { + if (stream_ != nullptr) *stream_ << x; + return *this; + } + + // Returns the underlying ostream. + ::std::ostream* stream() { return stream_; } + + // Returns true if and only if the listener is interested in an explanation + // of the match result. A matcher's MatchAndExplain() method can use + // this information to avoid generating the explanation when no one + // intends to hear it. + bool IsInterested() const { return stream_ != nullptr; } + + private: + ::std::ostream* const stream_; + + MatchResultListener(const MatchResultListener&) = delete; + MatchResultListener& operator=(const MatchResultListener&) = delete; +}; + +inline MatchResultListener::~MatchResultListener() {} + +// An instance of a subclass of this knows how to describe itself as a +// matcher. +class GTEST_API_ MatcherDescriberInterface { + public: + virtual ~MatcherDescriberInterface() {} + + // Describes this matcher to an ostream. The function should print + // a verb phrase that describes the property a value matching this + // matcher should have. The subject of the verb phrase is the value + // being matched. For example, the DescribeTo() method of the Gt(7) + // matcher prints "is greater than 7". + virtual void DescribeTo(::std::ostream* os) const = 0; + + // Describes the negation of this matcher to an ostream. For + // example, if the description of this matcher is "is greater than + // 7", the negated description could be "is not greater than 7". + // You are not required to override this when implementing + // MatcherInterface, but it is highly advised so that your matcher + // can produce good error messages. + virtual void DescribeNegationTo(::std::ostream* os) const { + *os << "not ("; + DescribeTo(os); + *os << ")"; + } +}; + +// The implementation of a matcher. +template +class MatcherInterface : public MatcherDescriberInterface { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener' if necessary (see the next paragraph), in + // the form of a non-restrictive relative clause ("which ...", + // "whose ...", etc) that describes x. For example, the + // MatchAndExplain() method of the Pointee(...) matcher should + // generate an explanation like "which points to ...". + // + // Implementations of MatchAndExplain() should add an explanation of + // the match result *if and only if* they can provide additional + // information that's not already present (or not obvious) in the + // print-out of x and the matcher's description. Whether the match + // succeeds is not a factor in deciding whether an explanation is + // needed, as sometimes the caller needs to print a failure message + // when the match succeeds (e.g. when the matcher is used inside + // Not()). + // + // For example, a "has at least 10 elements" matcher should explain + // what the actual element count is, regardless of the match result, + // as it is useful information to the reader; on the other hand, an + // "is empty" matcher probably only needs to explain what the actual + // size is when the match fails, as it's redundant to say that the + // size is 0 when the value is already known to be empty. + // + // You should override this method when defining a new matcher. + // + // It's the responsibility of the caller (Google Test) to guarantee + // that 'listener' is not NULL. This helps to simplify a matcher's + // implementation when it doesn't care about the performance, as it + // can talk to 'listener' without checking its validity first. + // However, in order to implement dummy listeners efficiently, + // listener->stream() may be NULL. + virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0; + + // Inherits these methods from MatcherDescriberInterface: + // virtual void DescribeTo(::std::ostream* os) const = 0; + // virtual void DescribeNegationTo(::std::ostream* os) const; +}; + +namespace internal { + +struct AnyEq { + template + bool operator()(const A& a, const B& b) const { + return a == b; + } +}; +struct AnyNe { + template + bool operator()(const A& a, const B& b) const { + return a != b; + } +}; +struct AnyLt { + template + bool operator()(const A& a, const B& b) const { + return a < b; + } +}; +struct AnyGt { + template + bool operator()(const A& a, const B& b) const { + return a > b; + } +}; +struct AnyLe { + template + bool operator()(const A& a, const B& b) const { + return a <= b; + } +}; +struct AnyGe { + template + bool operator()(const A& a, const B& b) const { + return a >= b; + } +}; + +// A match result listener that ignores the explanation. +class DummyMatchResultListener : public MatchResultListener { + public: + DummyMatchResultListener() : MatchResultListener(nullptr) {} + + private: + DummyMatchResultListener(const DummyMatchResultListener&) = delete; + DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete; +}; + +// A match result listener that forwards the explanation to a given +// ostream. The difference between this and MatchResultListener is +// that the former is concrete. +class StreamMatchResultListener : public MatchResultListener { + public: + explicit StreamMatchResultListener(::std::ostream* os) + : MatchResultListener(os) {} + + private: + StreamMatchResultListener(const StreamMatchResultListener&) = delete; + StreamMatchResultListener& operator=(const StreamMatchResultListener&) = + delete; +}; + +struct SharedPayloadBase { + std::atomic ref{1}; + void Ref() { ref.fetch_add(1, std::memory_order_relaxed); } + bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; } +}; + +template +struct SharedPayload : SharedPayloadBase { + explicit SharedPayload(const T& v) : value(v) {} + explicit SharedPayload(T&& v) : value(std::move(v)) {} + + static void Destroy(SharedPayloadBase* shared) { + delete static_cast(shared); + } + + T value; +}; + +// An internal class for implementing Matcher, which will derive +// from it. We put functionalities common to all Matcher +// specializations here to avoid code duplication. +template +class MatcherBase : private MatcherDescriberInterface { + public: + // Returns true if and only if the matcher matches x; also explains the + // match result to 'listener'. + bool MatchAndExplain(const T& x, MatchResultListener* listener) const { + GTEST_CHECK_(vtable_ != nullptr); + return vtable_->match_and_explain(*this, x, listener); + } + + // Returns true if and only if this matcher matches x. + bool Matches(const T& x) const { + DummyMatchResultListener dummy; + return MatchAndExplain(x, &dummy); + } + + // Describes this matcher to an ostream. + void DescribeTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, false); + } + + // Describes the negation of this matcher to an ostream. + void DescribeNegationTo(::std::ostream* os) const final { + GTEST_CHECK_(vtable_ != nullptr); + vtable_->describe(*this, os, true); + } + + // Explains why x matches, or doesn't match, the matcher. + void ExplainMatchResultTo(const T& x, ::std::ostream* os) const { + StreamMatchResultListener listener(os); + MatchAndExplain(x, &listener); + } + + // Returns the describer for this matcher object; retains ownership + // of the describer, which is only guaranteed to be alive when + // this matcher object is alive. + const MatcherDescriberInterface* GetDescriber() const { + if (vtable_ == nullptr) return nullptr; + return vtable_->get_describer(*this); + } + + protected: + MatcherBase() : vtable_(nullptr), buffer_() {} + + // Constructs a matcher from its implementation. + template + explicit MatcherBase(const MatcherInterface* impl) + : vtable_(nullptr), buffer_() { + Init(impl); + } + + template ::type::is_gtest_matcher> + MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT + Init(std::forward(m)); + } + + MatcherBase(const MatcherBase& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + if (IsShared()) buffer_.shared->Ref(); + } + + MatcherBase& operator=(const MatcherBase& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + if (IsShared()) buffer_.shared->Ref(); + return *this; + } + + MatcherBase(MatcherBase&& other) + : vtable_(other.vtable_), buffer_(other.buffer_) { + other.vtable_ = nullptr; + } + + MatcherBase& operator=(MatcherBase&& other) { + if (this == &other) return *this; + Destroy(); + vtable_ = other.vtable_; + buffer_ = other.buffer_; + other.vtable_ = nullptr; + return *this; + } + + ~MatcherBase() override { Destroy(); } + + private: + struct VTable { + bool (*match_and_explain)(const MatcherBase&, const T&, + MatchResultListener*); + void (*describe)(const MatcherBase&, std::ostream*, bool negation); + // Returns the captured object if it implements the interface, otherwise + // returns the MatcherBase itself. + const MatcherDescriberInterface* (*get_describer)(const MatcherBase&); + // Called on shared instances when the reference count reaches 0. + void (*shared_destroy)(SharedPayloadBase*); + }; + + bool IsShared() const { + return vtable_ != nullptr && vtable_->shared_destroy != nullptr; + } + + // If the implementation uses a listener, call that. + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) { + return P::Get(m).MatchAndExplain(value, listener->stream()); + } + + template + static auto MatchAndExplainImpl(const MatcherBase& m, const T& value, + MatchResultListener* listener) + -> decltype(P::Get(m).MatchAndExplain(value, listener)) { + return P::Get(m).MatchAndExplain(value, listener); + } + + template + static void DescribeImpl(const MatcherBase& m, std::ostream* os, + bool negation) { + if (negation) { + P::Get(m).DescribeNegationTo(os); + } else { + P::Get(m).DescribeTo(os); + } + } + + template + static const MatcherDescriberInterface* GetDescriberImpl( + const MatcherBase& m) { + // If the impl is a MatcherDescriberInterface, then return it. + // Otherwise use MatcherBase itself. + // This allows us to implement the GetDescriber() function without support + // from the impl, but some users really want to get their impl back when + // they call GetDescriber(). + // We use std::get on a tuple as a workaround of not having `if constexpr`. + return std::get<( + std::is_convertible::value + ? 1 + : 0)>(std::make_tuple(&m, &P::Get(m))); + } + + template + const VTable* GetVTable() { + static constexpr VTable kVTable = {&MatchAndExplainImpl

, + &DescribeImpl

, &GetDescriberImpl

, + P::shared_destroy}; + return &kVTable; + } + + union Buffer { + // Add some types to give Buffer some common alignment/size use cases. + void* ptr; + double d; + int64_t i; + // And add one for the out-of-line cases. + SharedPayloadBase* shared; + }; + + void Destroy() { + if (IsShared() && buffer_.shared->Unref()) { + vtable_->shared_destroy(buffer_.shared); + } + } + + template + static constexpr bool IsInlined() { + return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) && + std::is_trivially_copy_constructible::value && + std::is_trivially_destructible::value; + } + + template ()> + struct ValuePolicy { + static const M& Get(const MatcherBase& m) { + // When inlined along with Init, need to be explicit to avoid violating + // strict aliasing rules. + const M* ptr = + static_cast(static_cast(&m.buffer_)); + return *ptr; + } + static void Init(MatcherBase& m, M impl) { + ::new (static_cast(&m.buffer_)) M(impl); + } + static constexpr auto shared_destroy = nullptr; + }; + + template + struct ValuePolicy { + using Shared = SharedPayload; + static const M& Get(const MatcherBase& m) { + return static_cast(m.buffer_.shared)->value; + } + template + static void Init(MatcherBase& m, Arg&& arg) { + m.buffer_.shared = new Shared(std::forward(arg)); + } + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + struct ValuePolicy*, B> { + using M = const MatcherInterface; + using Shared = SharedPayload>; + static const M& Get(const MatcherBase& m) { + return *static_cast(m.buffer_.shared)->value; + } + static void Init(MatcherBase& m, M* impl) { + m.buffer_.shared = new Shared(std::unique_ptr(impl)); + } + + static constexpr auto shared_destroy = &Shared::Destroy; + }; + + template + void Init(M&& m) { + using MM = typename std::decay::type; + using Policy = ValuePolicy; + vtable_ = GetVTable(); + Policy::Init(*this, std::forward(m)); + } + + const VTable* vtable_; + Buffer buffer_; +}; + +} // namespace internal + +// A Matcher is a copyable and IMMUTABLE (except by assignment) +// object that can check whether a value of type T matches. The +// implementation of Matcher is just a std::shared_ptr to const +// MatcherInterface. Don't inherit from Matcher! +template +class Matcher : public internal::MatcherBase { + public: + // Constructs a null matcher. Needed for storing Matcher objects in STL + // containers. A default-constructed matcher is not yet initialized. You + // cannot use it until a valid value has been assigned to it. + explicit Matcher() {} // NOLINT + + // Constructs a matcher from its implementation. + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template + explicit Matcher( + const MatcherInterface* impl, + typename std::enable_if::value>::type* = + nullptr) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) : internal::MatcherBase(std::forward(m)) {} // NOLINT + + // Implicit constructor here allows people to write + // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes + Matcher(T value); // NOLINT +}; + +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT +}; + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// The following two specializations allow the user to write str +// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view +// matcher is expected. +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) { + } + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT +}; + +template <> +class GTEST_API_ Matcher + : public internal::MatcherBase { + public: + Matcher() {} + + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + explicit Matcher(const MatcherInterface* impl) + : internal::MatcherBase(impl) {} + + template ::type::is_gtest_matcher> + Matcher(M&& m) // NOLINT + : internal::MatcherBase(std::forward(m)) {} + + // Allows the user to write str instead of Eq(str) sometimes, where + // str is a std::string object. + Matcher(const std::string& s); // NOLINT + + // Allows the user to write "foo" instead of Eq("foo") sometimes. + Matcher(const char* s); // NOLINT + + // Allows the user to pass absl::string_views or std::string_views directly. + Matcher(internal::StringView s); // NOLINT +}; +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +// Prints a matcher in a human-readable format. +template +std::ostream& operator<<(std::ostream& os, const Matcher& matcher) { + matcher.DescribeTo(&os); + return os; +} + +// The PolymorphicMatcher class template makes it easy to implement a +// polymorphic matcher (i.e. a matcher that can match values of more +// than one type, e.g. Eq(n) and NotNull()). +// +// To define a polymorphic matcher, a user should provide an Impl +// class that has a DescribeTo() method and a DescribeNegationTo() +// method, and define a member function (or member function template) +// +// bool MatchAndExplain(const Value& value, +// MatchResultListener* listener) const; +// +// See the definition of NotNull() for a complete example. +template +class PolymorphicMatcher { + public: + explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {} + + // Returns a mutable reference to the underlying matcher + // implementation object. + Impl& mutable_impl() { return impl_; } + + // Returns an immutable reference to the underlying matcher + // implementation object. + const Impl& impl() const { return impl_; } + + template + operator Matcher() const { + return Matcher(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public MatcherInterface { + public: + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + void DescribeTo(::std::ostream* os) const override { impl_.DescribeTo(os); } + + void DescribeNegationTo(::std::ostream* os) const override { + impl_.DescribeNegationTo(os); + } + + bool MatchAndExplain(T x, MatchResultListener* listener) const override { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + }; + + Impl impl_; +}; + +// Creates a matcher from its implementation. +// DEPRECATED: Especially in the generic code, prefer: +// Matcher(new MyMatcherImpl(...)); +// +// MakeMatcher may create a Matcher that accepts its argument by value, which +// leads to unnecessary copies & lack of support for non-copyable types. +template +inline Matcher MakeMatcher(const MatcherInterface* impl) { + return Matcher(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher(foo); +template +inline PolymorphicMatcher MakePolymorphicMatcher(const Impl& impl) { + return PolymorphicMatcher(impl); +} + +namespace internal { +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). +template +class ComparisonBase { + public: + explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {} + + using is_gtest_matcher = void; + + template + bool MatchAndExplain(const Lhs& lhs, std::ostream*) const { + return Op()(lhs, Unwrap(rhs_)); + } + void DescribeTo(std::ostream* os) const { + *os << D::Desc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + void DescribeNegationTo(std::ostream* os) const { + *os << D::NegatedDesc() << " "; + UniversalPrint(Unwrap(rhs_), os); + } + + private: + template + static const T& Unwrap(const T& v) { + return v; + } + template + static const T& Unwrap(std::reference_wrapper v) { + return v; + } + + Rhs rhs_; +}; + +template +class EqMatcher : public ComparisonBase, Rhs, AnyEq> { + public: + explicit EqMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyEq>(rhs) {} + static const char* Desc() { return "is equal to"; } + static const char* NegatedDesc() { return "isn't equal to"; } +}; +template +class NeMatcher : public ComparisonBase, Rhs, AnyNe> { + public: + explicit NeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyNe>(rhs) {} + static const char* Desc() { return "isn't equal to"; } + static const char* NegatedDesc() { return "is equal to"; } +}; +template +class LtMatcher : public ComparisonBase, Rhs, AnyLt> { + public: + explicit LtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLt>(rhs) {} + static const char* Desc() { return "is <"; } + static const char* NegatedDesc() { return "isn't <"; } +}; +template +class GtMatcher : public ComparisonBase, Rhs, AnyGt> { + public: + explicit GtMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGt>(rhs) {} + static const char* Desc() { return "is >"; } + static const char* NegatedDesc() { return "isn't >"; } +}; +template +class LeMatcher : public ComparisonBase, Rhs, AnyLe> { + public: + explicit LeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyLe>(rhs) {} + static const char* Desc() { return "is <="; } + static const char* NegatedDesc() { return "isn't <="; } +}; +template +class GeMatcher : public ComparisonBase, Rhs, AnyGe> { + public: + explicit GeMatcher(const Rhs& rhs) + : ComparisonBase, Rhs, AnyGe>(rhs) {} + static const char* Desc() { return "is >="; } + static const char* NegatedDesc() { return "isn't >="; } +}; + +template ::value>::type> +using StringLike = T; + +// Implements polymorphic matchers MatchesRegex(regex) and +// ContainsRegex(regex), which can be used as a Matcher as long as +// T can be converted to a string. +class MatchesRegexMatcher { + public: + MatchesRegexMatcher(const RE* regex, bool full_match) + : regex_(regex), full_match_(full_match) {} + +#if GTEST_INTERNAL_HAS_STRING_VIEW + bool MatchAndExplain(const internal::StringView& s, + MatchResultListener* listener) const { + return MatchAndExplain(std::string(s), listener); + } +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + + // Accepts pointer types, particularly: + // const char* + // char* + // const wchar_t* + // wchar_t* + template + bool MatchAndExplain(CharType* s, MatchResultListener* listener) const { + return s != nullptr && MatchAndExplain(std::string(s), listener); + } + + // Matches anything that can convert to std::string. + // + // This is a template, not just a plain function with const std::string&, + // because absl::string_view has some interfering non-explicit constructors. + template + bool MatchAndExplain(const MatcheeStringType& s, + MatchResultListener* /* listener */) const { + const std::string& s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) + : RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream* os) const { + *os << (full_match_ ? "matches" : "contains") << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream* os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + private: + const std::shared_ptr regex_; + const bool full_match_; +}; +} // namespace internal + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher MatchesRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +template +PolymorphicMatcher MatchesRegex( + const internal::StringLike& regex) { + return MatchesRegex(new internal::RE(std::string(regex))); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher ContainsRegex( + const internal::RE* regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +template +PolymorphicMatcher ContainsRegex( + const internal::StringLike& regex) { + return ContainsRegex(new internal::RE(std::string(regex))); +} + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. +template +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} + +// Constructs a Matcher from a 'value' of type T. The constructed +// matcher matches any value that's equal to 'value'. +template +Matcher::Matcher(T value) { + *this = Eq(value); +} + +// Creates a monomorphic matcher that matches anything with type Lhs +// and equal to rhs. A user may need to use this instead of Eq(...) +// in order to resolve an overloading ambiguity. +// +// TypedEq(x) is just a convenient short-hand for Matcher(Eq(x)) +// or Matcher(x), but more readable than the latter. +// +// We could define similar monomorphic matchers for other comparison +// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do +// it yet as those are used much less than Eq() in practice. A user +// can always write Matcher(Lt(5)) to be explicit about the type, +// for example. +template +inline Matcher TypedEq(const Rhs& rhs) { + return Eq(rhs); +} + +// Creates a polymorphic matcher that matches anything >= x. +template +inline internal::GeMatcher Ge(Rhs x) { + return internal::GeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything > x. +template +inline internal::GtMatcher Gt(Rhs x) { + return internal::GtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything <= x. +template +inline internal::LeMatcher Le(Rhs x) { + return internal::LeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything < x. +template +inline internal::LtMatcher Lt(Rhs x) { + return internal::LtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything != x. +template +inline internal::NeMatcher Ne(Rhs x) { + return internal::NeMatcher(x); +} +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h new file mode 100644 index 0000000000..6c8bf90009 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -0,0 +1,218 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret&, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + + // Streams a non-pointer value to this object. + template + inline Message& operator<<(const T& val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator<<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator<<(T* const& pointer) { // NOLINT + if (pointer == nullptr) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator<<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator<<(bool b) { return *this << (b ? "true" : "false"); } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. + Message& operator<<(const wchar_t* wide_c_str); + Message& operator<<(wchar_t* wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator<<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + // We'll hold the text streamed to this object here. + const std::unique_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator<<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template +std::string StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h new file mode 100644 index 0000000000..b55119ac62 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -0,0 +1,510 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Macros and functions for implementing parameterized tests +// in Google C++ Testing and Mocking Framework (Google Test) + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test suite +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_SUITE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more than once) the first argument to the +// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the +// actual test suite name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. +// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests +// in the given test suite, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_SUITE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test suite is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test suite FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test suite StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings)); +// +// This instantiates tests from test suite StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_SUITE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_SUITE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename std::iterator_traits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename std::iterator_traits::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test suite BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_SUITE_P(NumSequence, +// BarTest, +// Values("one", "two", "three")); +// +// This instantiates tests from test suite BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// +template +internal::ValueArray Values(T... v) { + return internal::ValueArray(std::move(v)...); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test suite FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { return Values(false, true); } + +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// std::tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Example: +// +// This will instantiate tests in test suite AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// std::tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder Combine(const Generator&... g) { + return internal::CartesianProductHolder(g...); +} + +#define TEST_P(test_suite_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public test_suite_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + void TestBody() override; \ + \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestPattern( \ + GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ + new ::testing::internal::TestMetaFactory(), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify +// generator and an optional function or functor that generates custom test name +// suffixes based on the test parameters. Such a function or functor should +// accept one argument of type testing::TestParamInfo, and +// return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +#define GTEST_EXPAND_(arg) arg +#define GTEST_GET_FIRST_(first, ...) first +#define GTEST_GET_SECOND_(first, second, ...) second + +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo& info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +// Allow Marking a Parameterized test class as not needing to be instantiated. +#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ + namespace gtest_do_not_use_outside_namespace_scope {} \ + static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ + GTEST_STRINGIFY_(T)) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TEST_CASE_P \ + static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \ + ""); \ + INSTANTIATE_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h new file mode 100644 index 0000000000..a91e8b8b10 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -0,0 +1,1048 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. +// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Definitions in the internal* namespaces are subject to change without notice. +// DO NOT USE THEM IN USER CODE! +namespace internal { + +template +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +struct ContainerPrinter { + template (0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value>::type> + static void PrintValue(const T& container, std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (auto&& elem : container) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(elem, os) here as PrintTo() doesn't + // handle `elem` being a native array. + internal::UniversalPrint(elem, os); + ++count; + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; + } +}; + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +struct FunctionPointerPrinter { + template ::value>::type> + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } + } +}; + +struct PointerPrinter { + template + static void PrintValue(T* p, ::std::ostream* os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } + } +}; + +namespace internal_stream_operator_without_lexical_name_lookup { + +// The presence of an operator<< here will terminate lexical scope lookup +// straight away (even though it cannot be a match because of its argument +// types). Thus, the two operator<< calls in StreamPrinter will find only ADL +// candidates. +struct LookupBlocker {}; +void operator<<(LookupBlocker, LookupBlocker); + +struct StreamPrinter { + template ::value>::type, + // Only accept types for which we can find a streaming operator via + // ADL (possibly involving implicit conversions). + typename = decltype(std::declval() + << std::declval())> + static void PrintValue(const T& value, ::std::ostream* os) { + // Call streaming operator found by ADL, possibly with implicit conversions + // of the arguments. + *os << value; + } +}; + +} // namespace internal_stream_operator_without_lexical_name_lookup + +struct ProtobufPrinter { + // We print a protobuf using its ShortDebugString() when the string + // doesn't exceed this many characters; otherwise we print it using + // DebugString() for better readability. + static const size_t kProtobufOneLinerMaxLength = 50; + + template ::value>::type> + static void PrintValue(const T& value, ::std::ostream* os) { + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } + *os << ("<" + pretty_str + ">"); + } +}; + +struct ConvertibleToIntegerPrinter { + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(internal::BiggestInt value, ::std::ostream* os) { + *os << value; + } +}; + +struct ConvertibleToStringViewPrinter { +#if GTEST_INTERNAL_HAS_STRING_VIEW + static void PrintValue(internal::StringView value, ::std::ostream* os) { + internal::UniversalPrint(value, os); + } +#endif +}; + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, ::std::ostream* os); +struct RawBytesPrinter { + // SFINAE on `sizeof` to make sure we have a complete type. + template + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo( + static_cast( + // Load bearing cast to void* to support iOS + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; + +struct FallbackPrinter { + template + static void PrintValue(const T&, ::std::ostream* os) { + *os << "(incomplete type)"; + } +}; + +// Try every printer in order and return the first one that works. +template +struct FindFirstPrinter : FindFirstPrinter {}; + +template +struct FindFirstPrinter< + T, decltype(Printer::PrintValue(std::declval(), nullptr)), + Printer, Printers...> { + using type = Printer; +}; + +// Select the best printer in the following order: +// - Print containers (they have begin/end/etc). +// - Print function pointers. +// - Print object pointers. +// - Use the stream operator, if available. +// - Print protocol buffers. +// - Print types convertible to BiggestInt. +// - Print types convertible to StringView, if available. +// - Fallback to printing the raw bytes of the object. +template +void PrintWithFallback(const T& value, ::std::ostream* os) { + using Printer = typename FindFirstPrinter< + T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter, + internal_stream_operator_without_lexical_name_lookup::StreamPrinter, + ProtobufPrinter, ConvertibleToIntegerPrinter, + ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type; + Printer::PrintValue(value, os); +} + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint& value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint* value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); +#ifdef __cpp_lib_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType* value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); +#ifdef __cpp_char8_t +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string); +#endif +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string); + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T& value, ::std::ostream* os) { + internal::PrintWithFallback(value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os); +inline void PrintTo(char16_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#ifdef __cpp_char8_t +inline void PrintTo(char8_t c, ::std::ostream* os) { + PrintTo(ImplicitCast_(c), os); +} +#endif + +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os); +GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os); +#endif // __SIZEOF_INT128__ + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#ifdef __cpp_char8_t +// Overloads for u8 strings. +GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os); +inline void PrintTo(char8_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif +// Overloads for u16 strings. +GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os); +inline void PrintTo(char16_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +// Overloads for u32 strings. +GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os); +inline void PrintTo(char32_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::std::string. +GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::std::u8string +#ifdef __cpp_char8_t +GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) { + PrintU8StringTo(s, os); +} +#endif + +// Overloads for ::std::u16string +GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) { + PrintU16StringTo(s, os); +} + +// Overloads for ::std::u32string +GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os); +inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) { + PrintU32StringTo(s, os); +} + +// Overloads for ::std::wstring. +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Overload for internal::StringView. +inline void PrintTo(internal::StringView sp, ::std::ostream* os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } + +#if GTEST_HAS_RTTI +inline void PrintTo(const std::type_info& info, std::ostream* os) { + *os << internal::GetTypeName(info); +} +#endif // GTEST_HAS_RTTI + +template +void PrintTo(std::reference_wrapper ref, ::std::ostream* os) { + UniversalPrinter::Print(ref.get(), os); +} + +inline const void* VoidifyPointer(const void* p) { return p; } +inline const void* VoidifyPointer(volatile const void* p) { + return const_cast(p); +} + +template +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + // We can't print the value. Just print the pointer.. + *os << "(" << (VoidifyPointer)(ptr.get()) << ")"; + } +} +template ::value && + !std::is_array::value>::type> +void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) { + if (ptr == nullptr) { + *os << "(nullptr)"; + } else { + *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = "; + UniversalPrinter::Print(*ptr, os); + *os << ")"; + } +} + +template +void PrintTo(const std::unique_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +template +void PrintTo(const std::shared_ptr& ptr, std::ostream* os) { + (PrintSmartPointer)(ptr, os, 0); +} + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T&, std::integral_constant, + ::std::ostream*) {} + +template +void PrintTupleTo(const T& t, std::integral_constant, + ::std::ostream* os) { + PrintTupleTo(t, std::integral_constant(), os); + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (I > 1) { + GTEST_INTENTIONAL_CONST_COND_POP_() + *os << ", "; + } + UniversalPrinter::type>::Print( + std::get(t), os); +} + +template +void PrintTo(const ::std::tuple& t, ::std::ostream* os) { + *os << "("; + PrintTupleTo(t, std::integral_constant(), os); + *os << ")"; +} + +// Overload for std::pair. +template +void PrintTo(const ::std::pair& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter::Print(value.first, os); + *os << ", "; + UniversalPrinter::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Remove any const-qualifiers before passing a type to UniversalPrinter. +template +class UniversalPrinter : public UniversalPrinter {}; + +#if GTEST_INTERNAL_HAS_ANY + +// Printer for std::any / absl::any + +template <> +class UniversalPrinter { + public: + static void Print(const Any& value, ::std::ostream* os) { + if (value.has_value()) { + *os << "value of type " << GetTypeName(value); + } else { + *os << "no value"; + } + } + + private: + static std::string GetTypeName(const Any& value) { +#if GTEST_HAS_RTTI + return internal::GetTypeName(value.type()); +#else + static_cast(value); // possibly unused + return ""; +#endif // GTEST_HAS_RTTI + } +}; + +#endif // GTEST_INTERNAL_HAS_ANY + +#if GTEST_INTERNAL_HAS_OPTIONAL + +// Printer for std::optional / absl::optional + +template +class UniversalPrinter> { + public: + static void Print(const Optional& value, ::std::ostream* os) { + *os << '('; + if (!value) { + *os << "nullopt"; + } else { + UniversalPrint(*value, os); + } + *os << ')'; + } +}; + +template <> +class UniversalPrinter { + public: + static void Print(decltype(Nullopt()), ::std::ostream* os) { + *os << "(nullopt)"; + } +}; + +#endif // GTEST_INTERNAL_HAS_OPTIONAL + +#if GTEST_INTERNAL_HAS_VARIANT + +// Printer for std::variant / absl::variant + +template +class UniversalPrinter> { + public: + static void Print(const Variant& value, ::std::ostream* os) { + *os << '('; +#if GTEST_HAS_ABSL + absl::visit(Visitor{os, value.index()}, value); +#else + std::visit(Visitor{os, value.index()}, value); +#endif // GTEST_HAS_ABSL + *os << ')'; + } + + private: + struct Visitor { + template + void operator()(const U& u) const { + *os << "'" << GetTypeName() << "(index = " << index + << ")' with value "; + UniversalPrint(u, os); + } + ::std::ostream* os; + std::size_t index; + }; +}; + +#endif // GTEST_INTERNAL_HAS_VARIANT + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. +template +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray(const char* begin, size_t len, + ::std::ostream* os); + +#ifdef __cpp_char8_t +// This overload prints a (const) char8_t array compactly. +GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len, + ::std::ostream* os); +#endif + +// This overload prints a (const) char16_t array compactly. +GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) char32_t array compactly. +GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len, + ::std::ostream* os); + +// This overload prints a (const) wchar_t array compactly. +GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len, + ::std::ostream* os); + +// Implements printing an array type T[N]. +template +class UniversalPrinter { + public: + // Prints the given array, omitting some elements when there are too + // many. + static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180) + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast(&value) << " "; + + // Then prints the value itself. + UniversalPrint(value, os); + } + + GTEST_DISABLE_MSC_WARNINGS_POP_() +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. + +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); + } +}; +template +class UniversalTersePrinter { + public: + static void Print(const T (&value)[N], ::std::ostream* os) { + UniversalPrinter::Print(value, os); + } +}; +template <> +class UniversalTersePrinter { + public: + static void Print(const char* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(std::string(str), os); + } + } +}; +template <> +class UniversalTersePrinter : public UniversalTersePrinter { +}; + +#ifdef __cpp_char8_t +template <> +class UniversalTersePrinter { + public: + static void Print(const char8_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u8string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(const char16_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u16string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +template <> +class UniversalTersePrinter { + public: + static void Print(const char32_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::u32string(str), os); + } + } +}; +template <> +class UniversalTersePrinter + : public UniversalTersePrinter {}; + +#if GTEST_HAS_STD_WSTRING +template <> +class UniversalTersePrinter { + public: + static void Print(const wchar_t* str, ::std::ostream* os) { + if (str == nullptr) { + *os << "NULL"; + } else { + UniversalPrint(::std::wstring(str), os); + } + } +}; +#endif + +template <> +class UniversalTersePrinter { + public: + static void Print(wchar_t* str, ::std::ostream* os) { + UniversalTersePrinter::Print(str, os); + } +}; + +template +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalTersePrinter::Print(value, os); +} + +// Prints a value using the type inferred by the compiler. The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. +template +void UniversalPrint(const T& value, ::std::ostream* os) { + // A workarond for the bug in VC++ 7.1 that prevents us from instantiating + // UniversalPrinter with T directly. + typedef T T1; + UniversalPrinter::Print(value, os); +} + +typedef ::std::vector<::std::string> Strings; + +// Tersely prints the first N fields of a tuple to a string vector, +// one element for each field. +template +void TersePrintPrefixToStrings(const Tuple&, std::integral_constant, + Strings*) {} +template +void TersePrintPrefixToStrings(const Tuple& t, + std::integral_constant, + Strings* strings) { + TersePrintPrefixToStrings(t, std::integral_constant(), + strings); + ::std::stringstream ss; + UniversalTersePrint(std::get(t), &ss); + strings->push_back(ss.str()); +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". +template +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TersePrintPrefixToStrings( + value, std::integral_constant::value>(), + &result); + return result; +} + +} // namespace internal + +template +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrinter::Print(value, &ss); + return ss.str(); +} + +} // namespace testing + +// Include any custom printer added by the local installation. +// We must include this header at the end to make sure it can use the +// declarations from this file. +#include "gtest/internal/custom/gtest-printers.h" + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h new file mode 100644 index 0000000000..bec8c4810b --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -0,0 +1,248 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + ~ScopedFakeTestPartResultReporter() override; + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) = + delete; + ScopedFakeTestPartResultReporter& operator=( + const ScopedFakeTestPartResultReporter&) = delete; +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, const std::string& substr); + ~SingleFailureChecker(); + + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const std::string substr_; + + SingleFailureChecker(const SingleFailureChecker&) = delete; + SingleFailureChecker& operator=(const SingleFailureChecker&) = delete; +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but +// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. +#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper { \ + public: \ + static void Execute() { statement; } \ + }; \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + GTestExpectFatalFailureHelper::Execute(); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ, +// but not from an ASSERT_EQ). It asserts that the given statement will cause +// exactly one non-fatal Google Test failure with 'substr' being part of the +// failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. +#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + ::testing::TestPartResultArray gtest_failures; \ + ::testing::internal::SingleFailureChecker gtest_checker( \ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr)); \ + { \ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ + >est_failures); \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } \ + } \ + } while (::testing::internal::AlwaysFalse()) + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h new file mode 100644 index 0000000000..09cc8c34f0 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -0,0 +1,190 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure, // Failed and the test should be terminated. + kSkip // Skipped. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, const char* a_file_name, int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name == nullptr ? "" : a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) {} + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { + return file_name_.empty() ? nullptr : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true if and only if the test part was skipped. + bool skipped() const { return type_ == kSkip; } + + // Returns true if and only if the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true if and only if the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true if and only if the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + // Returns true if and only if the test part failed. + bool failed() const { return fatally_failed() || nonfatally_failed(); } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + TestPartResultArray(const TestPartResultArray&) = delete; + TestPartResultArray& operator=(const TestPartResultArray&) = delete; +}; + +// This interface knows how to report a test part result. +class GTEST_API_ TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + ~HasNewFatalFailureHelper() override; + void ReportTestPartResult(const TestPartResult& result) override; + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete; + HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete; +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h new file mode 100644 index 0000000000..bd35a32660 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -0,0 +1,331 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... + typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test suite, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_SUITE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_SUITE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test suite as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to the special name TypeParam to get the type + // parameter. Since we are inside a derived class template, C++ requires + // us to visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +// TYPED_TEST_SUITE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test suite +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_SUITE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test suite as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test suite name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_SUITE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test suite name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_SUITE above, +// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test suite. +#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_ + +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestSuiteName) \ + gtest_type_params_##TestSuiteName##_NameGenerator + +#define TYPED_TEST_SUITE(CaseName, Types, ...) \ + typedef ::testing::internal::GenerateTypeList::type \ + GTEST_TYPE_PARAMS_(CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ + GTEST_NAME_GENERATOR_(CaseName) + +#define TYPED_TEST(CaseName, TestName) \ + static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \ + "test-name must not be empty"); \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + void TestBody() override; \ + }; \ + static bool gtest_##CaseName##_##TestName##_registered_ \ + GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel, \ + GTEST_TYPE_PARAMS_( \ + CaseName)>::Register("", \ + ::testing::internal::CodeLocation( \ + __FILE__, __LINE__), \ + GTEST_STRINGIFY_(CaseName), \ + GTEST_STRINGIFY_(TestName), 0, \ + ::testing::internal::GenerateNames< \ + GTEST_NAME_GENERATOR_(CaseName), \ + GTEST_TYPE_PARAMS_(CaseName)>()); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, \ + TestName)::TestBody() + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE \ + static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \ + TYPED_TEST_SUITE +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Implements type-parameterized tests. + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test suite are defined in. The exact +// name of the namespace is subject to change without notice. +#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test suite. +#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \ + gtest_typed_test_suite_p_state_##TestSuiteName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test suite. +#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \ + gtest_registered_test_names_##TestSuiteName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. +#define TYPED_TEST_SUITE_P(SuiteName) \ + static ::testing::internal::TypedTestSuitePState \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define TYPED_TEST_CASE_P \ + static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \ + TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define TYPED_TEST_P(SuiteName, TestName) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + template \ + class TestName : public SuiteName { \ + private: \ + typedef SuiteName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + void TestBody() override; \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName( \ + __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName), \ + GTEST_STRINGIFY_(TestName)); \ + } \ + template \ + void GTEST_SUITE_NAMESPACE_( \ + SuiteName)::TestName::TestBody() + +// Note: this won't work correctly if the trailing arguments are macros. +#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...) \ + namespace GTEST_SUITE_NAMESPACE_(SuiteName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_( \ + SuiteName) GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \ + GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define REGISTER_TYPED_TEST_CASE_P \ + static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \ + ""); \ + REGISTER_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \ + static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \ + "test-suit-prefix must not be empty"); \ + static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestSuite< \ + SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \ + ::testing::internal::GenerateTypeList::type>:: \ + Register(GTEST_STRINGIFY_(Prefix), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \ + GTEST_STRINGIFY_(SuiteName), \ + GTEST_REGISTERED_TEST_NAMES_(SuiteName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ + ::testing::internal::GenerateTypeList::type>()) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TYPED_TEST_CASE_P \ + static_assert( \ + ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \ + INSTANTIATE_TYPED_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h index 581a44e95f..d19a587a18 100644 --- a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -26,10 +26,8 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) + +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for Google Test. It should be // included by any test program that uses Google Test. @@ -48,17452 +46,31 @@ // registration from Barthelemy Dagenais' (barthelemy@prologique.com) // easyUnit framework. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_H_ +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_ +#include #include +#include #include +#include #include -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares functions and macros used internally by -// Google Test. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan) -// -// Low-level types and utilities for porting Google Test to various -// platforms. They are subject to change without notice. DO NOT USE -// THEM IN USER CODE. -// -// This file is fundamental to Google Test. All other Google Test source -// files are expected to #include this. Therefore, it cannot #include -// any other Google Test header. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -// The user can define the following macros in the build script to -// control Google Test's behavior. If the user doesn't define a macro -// in this list, Google Test will define it. -// -// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) -// is/isn't available. -// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions -// are enabled. -// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). -// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). -// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular -// expressions are/aren't available. -// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that -// is/isn't available. -// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't -// enabled. -// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that -// std::wstring does/doesn't work (Google Test can -// be used where std::wstring is unavailable). -// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple -// is/isn't available. -// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the -// compiler supports Microsoft's "Structured -// Exception Handling". -// GTEST_HAS_STREAM_REDIRECTION -// - Define it to 1/0 to indicate whether the -// platform supports I/O stream redirection using -// dup() and dup2(). -// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google -// Test's own tr1 tuple implementation should be -// used. Unused when the user sets -// GTEST_HAS_TR1_TUPLE to 0. -// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test -// is building in C++11/C++98 mode. -// GTEST_LINKED_AS_SHARED_LIBRARY -// - Define to 1 when compiling tests that use -// Google Test as a shared library (known as -// DLL on Windows). -// GTEST_CREATE_SHARED_LIBRARY -// - Define to 1 when compiling Google Test itself -// as a shared library. - -// This header defines the following utilities: -// -// Macros indicating the current platform (defined to 1 if compiled on -// the given platform; otherwise undefined): -// GTEST_OS_AIX - IBM AIX -// GTEST_OS_CYGWIN - Cygwin -// GTEST_OS_HPUX - HP-UX -// GTEST_OS_LINUX - Linux -// GTEST_OS_LINUX_ANDROID - Google Android -// GTEST_OS_MAC - Mac OS X -// GTEST_OS_IOS - iOS -// GTEST_OS_IOS_SIMULATOR - iOS simulator -// GTEST_OS_NACL - Google Native Client (NaCl) -// GTEST_OS_OPENBSD - OpenBSD -// GTEST_OS_QNX - QNX -// GTEST_OS_SOLARIS - Sun Solaris -// GTEST_OS_SYMBIAN - Symbian -// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) -// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop -// GTEST_OS_WINDOWS_MINGW - MinGW -// GTEST_OS_WINDOWS_MOBILE - Windows Mobile -// GTEST_OS_ZOS - z/OS -// -// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the -// most stable support. Since core members of the Google Test project -// don't have access to other platforms, support for them may be less -// stable. If you notice any problems on your platform, please notify -// googletestframework@googlegroups.com (patches for fixing them are -// even more welcome!). -// -// Note that it is possible that none of the GTEST_OS_* macros are defined. -// -// Macros indicating available Google Test features (defined to 1 if -// the corresponding feature is supported; otherwise undefined): -// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized -// tests) -// GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests -// GTEST_HAS_TYPED_TEST - typed tests -// GTEST_HAS_TYPED_TEST_P - type-parameterized tests -// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with -// GTEST_HAS_POSIX_RE (see above) which users can -// define themselves. -// GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. -// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). -// -// Macros for basic C++ coding: -// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. -// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a -// variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. -// -// Synchronization: -// Mutex, MutexLock, ThreadLocal, GetThreadCount() -// - synchronization primitives. -// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above -// synchronization primitives have real implementations -// and Google Test is thread-safe; or 0 otherwise. -// -// Template meta programming: -// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. -// IteratorTraits - partial implementation of std::iterator_traits, which -// is not available in libCstd when compiled with Sun C++. -// -// Smart pointers: -// scoped_ptr - as in TR2. -// -// Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// -// Logging: -// GTEST_LOG_() - logs messages at the specified severity level. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. -// -// Stdout and stderr capturing: -// CaptureStdout() - starts capturing stdout. -// GetCapturedStdout() - stops capturing stdout and returns the captured -// string. -// CaptureStderr() - starts capturing stderr. -// GetCapturedStderr() - stops capturing stderr and returns the captured -// string. -// -// Integer types: -// TypeWithSize - maps an integer to a int type. -// Int32, UInt32, Int64, UInt64, TimeInMillis -// - integers of known sizes. -// BiggestInt - the biggest signed integer type. -// -// Command-line utilities: -// GTEST_FLAG() - references a flag. -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. -// GetInjectableArgvs() - returns the command line as a vector of strings. -// -// Environment variable utilities: -// GetEnv() - gets the value of an environment variable. -// BoolFromGTestEnv() - parses a bool environment variable. -// Int32FromGTestEnv() - parses an Int32 environment variable. -// StringFromGTestEnv() - parses a string environment variable. - -#include // for isspace, etc -#include // for ptrdiff_t -#include -#include -#include -#ifndef _WIN32_WCE -# include -# include -#endif // !_WIN32_WCE - -#if defined __APPLE__ -# include -# include -#endif - -#include // NOLINT -#include // NOLINT -#include // NOLINT - -#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -#define GTEST_FLAG_PREFIX_ "gtest_" -#define GTEST_FLAG_PREFIX_DASH_ "gtest-" -#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -#define GTEST_NAME_ "Google Test" -#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" - -// Determines the version of gcc that is used to compile this. -#ifdef __GNUC__ -// 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) -#endif // __GNUC__ - -// Determines the platform on which Google Test is compiled. -#ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -#elif defined __SYMBIAN32__ -# define GTEST_OS_SYMBIAN 1 -#elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(__MINGW__) || defined(__MINGW32__) -# define GTEST_OS_WINDOWS_MINGW 1 -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE -#elif defined __APPLE__ -# define GTEST_OS_MAC 1 -# if TARGET_OS_IPHONE -# define GTEST_OS_IOS 1 -# if TARGET_IPHONE_SIMULATOR -# define GTEST_OS_IOS_SIMULATOR 1 -# endif -# endif -#elif defined __linux__ -# define GTEST_OS_LINUX 1 -# if defined __ANDROID__ -# define GTEST_OS_LINUX_ANDROID 1 -# endif -#elif defined __MVS__ -# define GTEST_OS_ZOS 1 -#elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 -#elif defined(_AIX) -# define GTEST_OS_AIX 1 -#elif defined(__hpux) -# define GTEST_OS_HPUX 1 -#elif defined __native_client__ -# define GTEST_OS_NACL 1 -#elif defined __OpenBSD__ -# define GTEST_OS_OPENBSD 1 -#elif defined __QNX__ -# define GTEST_OS_QNX 1 -#endif // __CYGWIN__ - -#ifndef GTEST_LANG_CXX11 -// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when -// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a -// value for __cplusplus, and recent versions of clang, gcc, and -// probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L -// Compiling in at least C++11 mode. -# define GTEST_LANG_CXX11 1 -# else -# define GTEST_LANG_CXX11 0 -# endif -#endif - -// Brings in definitions for functions used in the testing::internal::posix -// namespace (read, write, close, chdir, isatty, stat). We do not currently -// use them on Windows Mobile. -#if !GTEST_OS_WINDOWS -// This assumes that non-Windows OSes provide unistd.h. For OSes where this -// is not the case, we need to include headers that provide the functions -// mentioned above. -# include -# include -#elif !GTEST_OS_WINDOWS_MOBILE -# include -# include -#endif - -#if GTEST_OS_LINUX_ANDROID -// Used to define __ANDROID_API__ matching the target NDK API level. -# include // NOLINT -#endif - -// Defines this to true iff Google Test can use POSIX regular expressions. -#ifndef GTEST_HAS_POSIX_RE -# if GTEST_OS_LINUX_ANDROID -// On Android, is only available starting with Gingerbread. -# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) -# else -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) -# endif -#endif - -#if GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . -# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_HAS_POSIX_RE - -#ifndef GTEST_HAS_EXCEPTIONS -// The user didn't tell us whether exceptions are enabled, so we need -// to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS -// macro to enable exceptions, so we'll do the same. -// Assumes that exceptions are enabled by default. -# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__GNUC__) && __EXCEPTIONS -// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) -// Sun Pro CC supports exceptions. However, there is no compile-time way of -// detecting whether they are enabled or not. Therefore, we assume that -// they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS -// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) -// Exception handling is in effect by default in HP aCC compiler. It has to -// be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else -// For other compilers, we assume exceptions are disabled to be -// conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) -#endif // GTEST_HAS_EXCEPTIONS - -#if !defined(GTEST_HAS_STD_STRING) -// Even though we don't use this macro any longer, we keep it in case -// some clients still depend on it. -# define GTEST_HAS_STD_STRING 1 -#elif !GTEST_HAS_STD_STRING -// The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." -#endif // !defined(GTEST_HAS_STD_STRING) - -#ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - -# define GTEST_HAS_GLOBAL_STRING 0 - -#endif // GTEST_HAS_GLOBAL_STRING - -#ifndef GTEST_HAS_STD_WSTRING -// The user didn't tell us whether ::std::wstring is available, so we need -// to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring -// is available. - -// Cygwin 1.7 and below doesn't support ::std::wstring. -// Solaris' libc++ doesn't support it either. Android has -// no support for it at least as recent as Froyo (2.2). -# define GTEST_HAS_STD_WSTRING \ - (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) - -#endif // GTEST_HAS_STD_WSTRING - -#ifndef GTEST_HAS_GLOBAL_WSTRING -// The user didn't tell us whether ::wstring is available, so we need -// to figure it out. -# define GTEST_HAS_GLOBAL_WSTRING \ - (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Determines whether RTTI is available. -#ifndef GTEST_HAS_RTTI -// The user didn't tell us whether RTTI is enabled, so we need to -// figure it out. - -# ifdef _MSC_VER - -# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) - -# ifdef __GXX_RTTI -// When building against STLport with the Android NDK and with -// -frtti -fno-exceptions, the build fails at link time with undefined -// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, -// so disable RTTI when detected. -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \ - !defined(__EXCEPTIONS) -# define GTEST_HAS_RTTI 0 -# else -# define GTEST_HAS_RTTI 1 -# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI - -// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends -// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the -// first version with C++ support. -# elif defined(__clang__) - -# define GTEST_HAS_RTTI __has_feature(cxx_rtti) - -// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if -// both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) - -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -# else - -// For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 - -# endif // _MSC_VER - -#endif // GTEST_HAS_RTTI - -// It's this header's responsibility to #include when RTTI -// is enabled. -#if GTEST_HAS_RTTI -# include -#endif - -// Determines whether Google Test can use the pthreads library. -#ifndef GTEST_HAS_PTHREAD -// The user didn't tell us explicitly, so we assume pthreads support is -// available on Linux and Mac. -// -// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 -// to your compiler flags. -# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ - || GTEST_OS_QNX) -#endif // GTEST_HAS_PTHREAD - -#if GTEST_HAS_PTHREAD -// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is -// true. -# include // NOLINT - -// For timespec and nanosleep, used below. -# include // NOLINT -#endif - -// Determines whether Google Test can use tr1/tuple. You can define -// this macro to 0 to prevent Google Test from using tuple (any -// feature depending on tuple with be disabled in this mode). -#ifndef GTEST_HAS_TR1_TUPLE -# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) -// STLport, provided with the Android NDK, has neither or . -# define GTEST_HAS_TR1_TUPLE 0 -# else -// The user didn't tell us not to do it, so we assume it's OK. -# define GTEST_HAS_TR1_TUPLE 1 -# endif -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether Google Test's own tr1 tuple implementation -// should be used. -#ifndef GTEST_USE_OWN_TR1_TUPLE -// The user didn't tell us, so we need to figure it out. - -// We use our own TR1 tuple if we aren't sure the user has an -// implementation of it already. At this time, libstdc++ 4.0.0+ and -// MSVC 2010 are the only mainstream standard libraries that come -// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler -// pretends to be GCC by defining __GNUC__ and friends, but cannot -// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1 -// tuple in a 323 MB Feature Pack download, which we cannot assume the -// user has. QNX's QCC compiler is a modified GCC but it doesn't -// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, -// and it can be used with some compilers that define __GNUC__. -# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 -# define GTEST_ENV_HAS_TR1_TUPLE_ 1 -# endif - -// C++11 specifies that provides std::tuple. Use that if gtest is used -// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6 -// can build with clang but need to use gcc4.2's libstdc++). -# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325) -# define GTEST_ENV_HAS_STD_TUPLE_ 1 -# endif - -# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_ -# define GTEST_USE_OWN_TR1_TUPLE 0 -# else -# define GTEST_USE_OWN_TR1_TUPLE 1 -# endif - -#endif // GTEST_USE_OWN_TR1_TUPLE - -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tr1/tuple. -#if GTEST_HAS_TR1_TUPLE - -# if GTEST_USE_OWN_TR1_TUPLE -// This file was GENERATED by command: -// pump.py gtest-tuple.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> -#define GTEST_1_TUPLE_(T) tuple -#define GTEST_2_TUPLE_(T) tuple -#define GTEST_3_TUPLE_(T) tuple -#define GTEST_4_TUPLE_(T) tuple -#define GTEST_5_TUPLE_(T) tuple -#define GTEST_6_TUPLE_(T) tuple -#define GTEST_7_TUPLE_(T) tuple -#define GTEST_8_TUPLE_(T) tuple -#define GTEST_9_TUPLE_(T) tuple -#define GTEST_10_TUPLE_(T) tuple - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. -#define GTEST_0_TYPENAMES_(T) -#define GTEST_1_TYPENAMES_(T) typename T##0 -#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 -#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 -#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3 -#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4 -#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5 -#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6 -#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 -#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8 -#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8, typename T##9 - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. -template -struct TupleElement; - -template -struct TupleElement { - typedef T0 type; -}; - -template -struct TupleElement { - typedef T1 type; -}; - -template -struct TupleElement { - typedef T2 type; -}; - -template -struct TupleElement { - typedef T3 type; -}; - -template -struct TupleElement { - typedef T4 type; -}; - -template -struct TupleElement { - typedef T5 type; -}; - -template -struct TupleElement { - typedef T6 type; -}; - -template -struct TupleElement { - typedef T7 type; -}; - -template -struct TupleElement { - typedef T8 type; -}; - -template -struct TupleElement { - typedef T9 type; -}; - -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - -template -class GTEST_1_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} - - tuple(const tuple& t) : f0_(t.f0_) {} - - template - tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_1_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { - f0_ = t.f0_; - return *this; - } - - T0 f0_; -}; - -template -class GTEST_2_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), - f1_(f1) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} - - template - tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_2_TUPLE_(U)& t) { - return CopyFrom(t); - } - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - return *this; - } - - T0 f0_; - T1 f1_; -}; - -template -class GTEST_3_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - template - tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_3_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; -}; - -template -class GTEST_4_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} - - template - tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_4_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; -}; - -template -class GTEST_5_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, - GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_) {} - - template - tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_5_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; -}; - -template -class GTEST_6_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_) {} - - template - tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_6_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; -}; - -template -class GTEST_7_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - template - tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_7_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; -}; - -template -class GTEST_8_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, - GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - template - tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_8_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; -}; - -template -class GTEST_9_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - template - tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_9_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; -}; - -template -class tuple { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), - f9_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} - - template - tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), - f9_(t.f9_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_10_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - f9_ = t.f9_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; - T9 f9_; -}; - -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -template -inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { - return GTEST_1_TUPLE_(T)(f0); -} - -template -inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { - return GTEST_2_TUPLE_(T)(f0, f1); -} - -template -inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { - return GTEST_3_TUPLE_(T)(f0, f1, f2); -} - -template -inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3) { - return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); -} - -template -inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4) { - return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); -} - -template -inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5) { - return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); -} - -template -inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6) { - return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); -} - -template -inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { - return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); -} - -template -inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8) { - return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); -} - -template -inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8, const T9& f9) { - return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); -} - -// 6.1.3.3 Tuple helper classes. - -template struct tuple_size; - -template -struct tuple_size { - static const int value = 0; -}; - -template -struct tuple_size { - static const int value = 1; -}; - -template -struct tuple_size { - static const int value = 2; -}; - -template -struct tuple_size { - static const int value = 3; -}; - -template -struct tuple_size { - static const int value = 4; -}; - -template -struct tuple_size { - static const int value = 5; -}; - -template -struct tuple_size { - static const int value = 6; -}; - -template -struct tuple_size { - static const int value = 7; -}; - -template -struct tuple_size { - static const int value = 8; -}; - -template -struct tuple_size { - static const int value = 9; -}; - -template -struct tuple_size { - static const int value = 10; -}; - -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. - -namespace gtest_internal { - -template <> -class Get<0> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - Field(Tuple& t) { return t.f0_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - ConstField(const Tuple& t) { return t.f0_; } -}; - -template <> -class Get<1> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - Field(Tuple& t) { return t.f1_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - ConstField(const Tuple& t) { return t.f1_; } -}; - -template <> -class Get<2> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - Field(Tuple& t) { return t.f2_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - ConstField(const Tuple& t) { return t.f2_; } -}; - -template <> -class Get<3> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - Field(Tuple& t) { return t.f3_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - ConstField(const Tuple& t) { return t.f3_; } -}; - -template <> -class Get<4> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - Field(Tuple& t) { return t.f4_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - ConstField(const Tuple& t) { return t.f4_; } -}; - -template <> -class Get<5> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - Field(Tuple& t) { return t.f5_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - ConstField(const Tuple& t) { return t.f5_; } -}; - -template <> -class Get<6> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - Field(Tuple& t) { return t.f6_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - ConstField(const Tuple& t) { return t.f6_; } -}; - -template <> -class Get<7> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - Field(Tuple& t) { return t.f7_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - ConstField(const Tuple& t) { return t.f7_; } -}; - -template <> -class Get<8> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - Field(Tuple& t) { return t.f8_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - ConstField(const Tuple& t) { return t.f8_; } -}; - -template <> -class Get<9> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - Field(Tuple& t) { return t.f9_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - ConstField(const Tuple& t) { return t.f9_; } -}; - -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(const GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. -template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - -#undef GTEST_0_TUPLE_ -#undef GTEST_1_TUPLE_ -#undef GTEST_2_TUPLE_ -#undef GTEST_3_TUPLE_ -#undef GTEST_4_TUPLE_ -#undef GTEST_5_TUPLE_ -#undef GTEST_6_TUPLE_ -#undef GTEST_7_TUPLE_ -#undef GTEST_8_TUPLE_ -#undef GTEST_9_TUPLE_ -#undef GTEST_10_TUPLE_ - -#undef GTEST_0_TYPENAMES_ -#undef GTEST_1_TYPENAMES_ -#undef GTEST_2_TYPENAMES_ -#undef GTEST_3_TYPENAMES_ -#undef GTEST_4_TYPENAMES_ -#undef GTEST_5_TYPENAMES_ -#undef GTEST_6_TYPENAMES_ -#undef GTEST_7_TYPENAMES_ -#undef GTEST_8_TYPENAMES_ -#undef GTEST_9_TYPENAMES_ -#undef GTEST_10_TYPENAMES_ - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -# elif GTEST_ENV_HAS_STD_TUPLE_ -# include -// C++11 puts its tuple into the ::std namespace rather than -// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. -// This causes undefined behavior, but supported compilers react in -// the way we intend. -namespace std { -namespace tr1 { -using ::std::get; -using ::std::make_tuple; -using ::std::tuple; -using ::std::tuple_element; -using ::std::tuple_size; -} -} - -# elif GTEST_OS_SYMBIAN - -// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to -// use STLport's tuple implementation, which unfortunately doesn't -// work as the copy of STLport distributed with Symbian is incomplete. -// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to -// use its own tuple implementation. -# ifdef BOOST_HAS_TR1_TUPLE -# undef BOOST_HAS_TR1_TUPLE -# endif // BOOST_HAS_TR1_TUPLE - -// This prevents , which defines -// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . -# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED -# include - -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) -// GCC 4.0+ implements tr1/tuple in the header. This does -// not conform to the TR1 spec, which requires the header to be . - -# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -// Until version 4.3.2, gcc has a bug that causes , -// which is #included by , to not compile when RTTI is -// disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is a hack to prevent -// from being included. -# define _TR1_FUNCTIONAL 1 -# include -# undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. -# else -# include // NOLINT -# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 - -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. -# include // NOLINT -# endif // GTEST_USE_OWN_TR1_TUPLE - -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether clone(2) is supported. -// Usually it will only be available on Linux, excluding -// Linux on the Itanium architecture. -// Also see http://linux.die.net/man/2/clone. -#ifndef GTEST_HAS_CLONE -// The user didn't tell us, so we need to figure it out. - -# if GTEST_OS_LINUX && !defined(__ia64__) -# if GTEST_OS_LINUX_ANDROID -// On Android, clone() is only available on ARM starting with Gingerbread. -# if defined(__arm__) && __ANDROID_API__ >= 9 -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif -# else -# define GTEST_HAS_CLONE 1 -# endif -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) - -#endif // GTEST_HAS_CLONE - -// Determines whether to support stream redirection. This is used to test -// output correctness and to implement death tests. -#ifndef GTEST_HAS_STREAM_REDIRECTION -// By default, we assume that stream redirection is supported on all -// platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN -#endif // GTEST_HAS_STREAM_REDIRECTION - -// Determines whether to support death tests. -// Google Test does not support death tests for VC 7.1 and earlier as -// abort() in a VC 7.1 application compiled as GUI in debug config -// pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || GTEST_OS_IOS_SIMULATOR || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ - GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX) -# define GTEST_HAS_DEATH_TEST 1 -# include // NOLINT -#endif - -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - -// Determines whether to support type-driven tests. - -// Typed tests need and variadic macros, which GCC, VC++ 8.0, -// Sun Pro CC, IBM Visual Age, and HP aCC support. -#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ - defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 -#endif - -// Determines whether to support Combine(). This only makes sense when -// value-parameterized tests are enabled. The implementation doesn't -// work on Sun Studio since it doesn't understand templated conversion -// operators. -#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) -# define GTEST_HAS_COMBINE 1 -#endif - -// Determines whether the system compiler uses UTF-16 for encoding wide strings. -#define GTEST_WIDE_STRING_USES_UTF16_ \ - (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) - -// Determines whether test results can be streamed to a socket. -#if GTEST_OS_LINUX -# define GTEST_CAN_STREAM_RESULTS_ 1 -#endif - -// Defines some utility macros. - -// The GNU compiler emits a warning if nested "if" statements are followed by -// an "else" statement and braces are not used to explicitly disambiguate the -// "else" binding. This leads to problems with code like: -// -// if (gate) -// ASSERT_*(condition) << "Some message"; -// -// The "switch (0) case 0:" idiom is used to suppress this. -#ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ -#else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT -#endif - -// Use this annotation at the end of a struct/class definition to -// prevent the compiler from optimizing away instances that are never -// used. This is useful when all interesting logic happens inside the -// c'tor and / or d'tor. Example: -// -// struct Foo { -// Foo() { ... } -// } GTEST_ATTRIBUTE_UNUSED_; -// -// Also use it after a variable or parameter declaration to tell the -// compiler the variable/parameter does not have to be used. -#if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -#else -# define GTEST_ATTRIBUTE_UNUSED_ -#endif - -// A macro to disallow operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ - GTEST_DISALLOW_ASSIGN_(type) - -// Tell the compiler to warn about unused return values for functions declared -// with this macro. The macro should be used on function declarations -// following the argument list: -// -// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; -#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) -#else -# define GTEST_MUST_USE_RESULT_ -#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC - -// Determine whether the compiler supports Microsoft's Structured Exception -// Handling. This is supported by several Windows compilers but generally -// does not exist on any other system. -#ifndef GTEST_HAS_SEH -// The user didn't tell us, so we need to figure it out. - -# if defined(_MSC_VER) || defined(__BORLANDC__) -// These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else -// Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif - -#endif // GTEST_HAS_SEH - -#ifdef _MSC_VER - -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif - -#endif // _MSC_VER - -#ifndef GTEST_API_ -# define GTEST_API_ -#endif - -#ifdef __GNUC__ -// Ask the compiler to never inline a given function. -# define GTEST_NO_INLINE_ __attribute__((noinline)) -#else -# define GTEST_NO_INLINE_ -#endif - -// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -# define GTEST_HAS_CXXABI_H_ 1 -#else -# define GTEST_HAS_CXXABI_H_ 0 -#endif - -namespace testing { - -class Message; - -namespace internal { - -// A secret type that Google Test users don't know about. It has no -// definition on purpose. Therefore it's impossible to create a -// Secret object, which is what we want. -class Secret; - -// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time -// expression is true. For example, you could use it to verify the -// size of a static array: -// -// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, -// content_type_names_incorrect_size); -// -// or to make sure a struct is smaller than a certain size: -// -// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); -// -// The second argument to the macro is the name of the variable. If -// the expression is false, most compilers will issue a warning/error -// containing the name of the variable. - -template -struct CompileAssert { -}; - -#define GTEST_COMPILE_ASSERT_(expr, msg) \ - typedef ::testing::internal::CompileAssert<(static_cast(expr))> \ - msg[static_cast(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_ - -// Implementation details of GTEST_COMPILE_ASSERT_: -// -// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 -// elements (and thus is invalid) when the expression is false. -// -// - The simpler definition -// -// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] -// -// does not work, as gcc supports variable-length arrays whose sizes -// are determined at run-time (this is gcc's extension and not part -// of the C++ standard). As a result, gcc fails to reject the -// following code with the simple definition: -// -// int foo; -// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is -// // not a compile-time constant. -// -// - By using the type CompileAssert<(bool(expr))>, we ensures that -// expr is a compile-time constant. (Template arguments must be -// determined at compile-time.) -// -// - The outter parentheses in CompileAssert<(bool(expr))> are necessary -// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written -// -// CompileAssert -// -// instead, these compilers will refuse to compile -// -// GTEST_COMPILE_ASSERT_(5 > 0, some_message); -// -// (They seem to think the ">" in "5 > 0" marks the end of the -// template argument list.) -// -// - The array size is (bool(expr) ? 1 : -1), instead of simply -// -// ((expr) ? 1 : -1). -// -// This is to avoid running into a bug in MS VC 7.1, which -// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. - -// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. -// -// This template is declared, but intentionally undefined. -template -struct StaticAssertTypeEqHelper; - -template -struct StaticAssertTypeEqHelper {}; - -#if GTEST_HAS_GLOBAL_STRING -typedef ::string string; -#else -typedef ::std::string string; -#endif // GTEST_HAS_GLOBAL_STRING - -#if GTEST_HAS_GLOBAL_WSTRING -typedef ::wstring wstring; -#elif GTEST_HAS_STD_WSTRING -typedef ::std::wstring wstring; -#endif // GTEST_HAS_GLOBAL_WSTRING - -// A helper for suppressing warnings on constant condition. It just -// returns 'condition'. -GTEST_API_ bool IsTrue(bool condition); - -// Defines scoped_ptr. - -// This implementation of scoped_ptr is PARTIAL - it only contains -// enough stuff to satisfy Google Test's need. -template -class scoped_ptr { - public: - typedef T element_type; - - explicit scoped_ptr(T* p = NULL) : ptr_(p) {} - ~scoped_ptr() { reset(); } - - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - - T* release() { - T* const ptr = ptr_; - ptr_ = NULL; - return ptr; - } - - void reset(T* p = NULL) { - if (p != ptr_) { - if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. - delete ptr_; - } - ptr_ = p; - } - } - - private: - T* ptr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); -}; - -// Defines RE. - -// A simple C++ wrapper for . It uses the POSIX Extended -// Regular Expression syntax. -class GTEST_API_ RE { - public: - // A copy constructor is required by the Standard to initialize object - // references from r-values. - RE(const RE& other) { Init(other.pattern()); } - - // Constructs an RE from a string. - RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT - -#if GTEST_HAS_GLOBAL_STRING - - RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT - -#endif // GTEST_HAS_GLOBAL_STRING - - RE(const char* regex) { Init(regex); } // NOLINT - ~RE(); - - // Returns the string representation of the regex. - const char* pattern() const { return pattern_; } - - // FullMatch(str, re) returns true iff regular expression re matches - // the entire str. - // PartialMatch(str, re) returns true iff regular expression re - // matches a substring of str (including str itself). - // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work - // when str contains NUL characters. - static bool FullMatch(const ::std::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::std::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#if GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const ::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#endif // GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const char* str, const RE& re); - static bool PartialMatch(const char* str, const RE& re); - - private: - void Init(const char* regex); - - // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. TODO(wan@google.com): change to - // std::string. - const char* pattern_; - bool is_valid_; - -#if GTEST_USES_POSIX_RE - - regex_t full_regex_; // For FullMatch(). - regex_t partial_regex_; // For PartialMatch(). - -#else // GTEST_USES_SIMPLE_RE - - const char* full_pattern_; // For FullMatch(); - -#endif - - GTEST_DISALLOW_ASSIGN_(RE); -}; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, - int line); - -// Defines logging utilities: -// GTEST_LOG_(severity) - logs messages at the specified severity level. The -// message itself is streamed into the macro. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. - -enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; - -// Formats log entry severity, provides a stream object for streaming the -// log message, and terminates the message with a newline when going out of -// scope. -class GTEST_API_ GTestLog { - public: - GTestLog(GTestLogSeverity severity, const char* file, int line); - - // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. - ~GTestLog(); - - ::std::ostream& GetStream() { return ::std::cerr; } - - private: - const GTestLogSeverity severity_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); -}; - -#define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() - -inline void LogToStderr() {} -inline void FlushInfoLog() { fflush(NULL); } - -// INTERNAL IMPLEMENTATION - DO NOT USE. -// -// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition -// is not satisfied. -// Synopsys: -// GTEST_CHECK_(boolean_condition); -// or -// GTEST_CHECK_(boolean_condition) << "Additional message"; -// -// This checks the condition and if the condition is not satisfied -// it prints message about the condition violation, including the -// condition itself, plus additional message streamed into it, if any, -// and then it aborts the program. It aborts the program irrespective of -// whether it is built in the debug mode or not. -#define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " - -// An all-mode assert to verify that the given POSIX-style function -// call returns 0 (indicating success). Known limitation: this -// doesn't expand to a balanced 'if' statement, so enclose the macro -// in {} if you need to use it as the only statement in an 'if' -// branch. -#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Use ImplicitCast_ as a safe version of static_cast for upcasting in -// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a -// const Foo*). When you use ImplicitCast_, the compiler checks that -// the cast is safe. Such explicit ImplicitCast_s are necessary in -// surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. -// -// The syntax for using ImplicitCast_ is the same as for static_cast: -// -// ImplicitCast_(expr) -// -// ImplicitCast_ would have been part of the C++ standard library, -// but the proposal was submitted too late. It will probably make -// its way into the language in the future. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., implicit_cast). The internal -// namespace alone is not enough because the function can be found by ADL. -template -inline To ImplicitCast_(To x) { return x; } - -// When you upcast (that is, cast a pointer from type Foo to type -// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts -// always succeed. When you downcast (that is, cast a pointer from -// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because -// how do you know the pointer is really of type SubclassOfFoo? It -// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, -// when you downcast, you should use this macro. In debug mode, we -// use dynamic_cast<> to double-check the downcast is legal (we die -// if it's not). In normal mode, we do the efficient static_cast<> -// instead. Thus, it's important to test in debug mode to make sure -// the cast is legal! -// This is the only place in the code we should use dynamic_cast<>. -// In particular, you SHOULDN'T be using dynamic_cast<> in order to -// do RTTI (eg code like this: -// if (dynamic_cast(foo)) HandleASubclass1Object(foo); -// if (dynamic_cast(foo)) HandleASubclass2Object(foo); -// You should design the code some other way not to need this. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., down_cast). The internal -// namespace alone is not enough because the function can be found by ADL. -template // use like this: DownCast_(foo); -inline To DownCast_(From* f) { // so we only accept pointers - // Ensures that To is a sub-type of From *. This test is here only - // for compile-time type checking, and has no overhead in an - // optimized build at run-time, as it will be optimized away - // completely. - if (false) { - const To to = NULL; - ::testing::internal::ImplicitCast_(to); - } - -#if GTEST_HAS_RTTI - // RTTI: debug mode only! - GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); -#endif - return static_cast(f); -} - -// Downcasts the pointer of type Base to Derived. -// Derived must be a subclass of Base. The parameter MUST -// point to a class of type Derived, not any subclass of it. -// When RTTI is available, the function performs a runtime -// check to enforce this. -template -Derived* CheckedDowncastToActualType(Base* base) { -#if GTEST_HAS_RTTI - GTEST_CHECK_(typeid(*base) == typeid(Derived)); - return dynamic_cast(base); // NOLINT -#else - return static_cast(base); // Poor man's downcast. -#endif -} - -#if GTEST_HAS_STREAM_REDIRECTION - -// Defines the stderr capturer: -// CaptureStdout - starts capturing stdout. -// GetCapturedStdout - stops capturing stdout and returns the captured string. -// CaptureStderr - starts capturing stderr. -// GetCapturedStderr - stops capturing stderr and returns the captured string. -// -GTEST_API_ void CaptureStdout(); -GTEST_API_ std::string GetCapturedStdout(); -GTEST_API_ void CaptureStderr(); -GTEST_API_ std::string GetCapturedStderr(); - -#endif // GTEST_HAS_STREAM_REDIRECTION - - -#if GTEST_HAS_DEATH_TEST - -const ::std::vector& GetInjectableArgvs(); -void SetInjectableArgvs(const ::std::vector* - new_argvs); - -// A copy of all command line arguments. Set by InitGoogleTest(). -extern ::std::vector g_argvs; - -#endif // GTEST_HAS_DEATH_TEST - -// Defines synchronization primitives. - -#if GTEST_HAS_PTHREAD - -// Sleeps for (roughly) n milli-seconds. This function is only for -// testing Google Test's own constructs. Don't use it in user tests, -// either directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, NULL); -} - -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - } - ~Notification() { - pthread_mutex_destroy(&mutex_); - } - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { - pthread_mutex_lock(&mutex_); - notified_ = true; - pthread_mutex_unlock(&mutex_); - } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - for (;;) { - pthread_mutex_lock(&mutex_); - const bool notified = notified_; - pthread_mutex_unlock(&mutex_); - if (notified) - break; - SleepMilliseconds(10); - } - } - - private: - pthread_mutex_t mutex_; - bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. -// Consequently, it cannot select a correct instantiation of ThreadWithParam -// in order to call its Run(). Introducing ThreadWithParamBase as a -// non-templated base class for ThreadWithParam allows us to bypass this -// problem. -class ThreadWithParamBase { - public: - virtual ~ThreadWithParamBase() {} - virtual void Run() = 0; -}; - -// pthread_create() accepts a pointer to a function type with the C linkage. -// According to the Standard (7.5/1), function types with different linkages -// are different even if they are otherwise identical. Some compilers (for -// example, SunStudio) treat them as different types. Since class methods -// cannot be defined with C-linkage we need to define a free C-function to -// pass into pthread_create(). -extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { - static_cast(thread)->Run(); - return NULL; -} - -// Helper class for testing Google Test's multi-threading constructs. -// To use it, write: -// -// void ThreadFunc(int param) { /* Do things with param */ } -// Notification thread_can_start; -// ... -// // The thread_can_start parameter is optional; you can supply NULL. -// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); -// thread_can_start.Notify(); -// -// These classes are only for testing Google Test's own constructs. Do -// not use them in user tests, either directly or indirectly. -template -class ThreadWithParam : public ThreadWithParamBase { - public: - typedef void (*UserThreadFunc)(T); - - ThreadWithParam( - UserThreadFunc func, T param, Notification* thread_can_start) - : func_(func), - param_(param), - thread_can_start_(thread_can_start), - finished_(false) { - ThreadWithParamBase* const base = this; - // The thread can be created only after all fields except thread_ - // have been initialized. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); - } - ~ThreadWithParam() { Join(); } - - void Join() { - if (!finished_) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); - finished_ = true; - } - } - - virtual void Run() { - if (thread_can_start_ != NULL) - thread_can_start_->WaitForNotification(); - func_(param_); - } - - private: - const UserThreadFunc func_; // User-supplied thread function. - const T param_; // User-supplied parameter to the thread function. - // When non-NULL, used to block execution until the controller thread - // notifies. - Notification* const thread_can_start_; - bool finished_; // true iff we know that the thread function has finished. - pthread_t thread_; // The native thread object. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); -}; - -// MutexBase and Mutex implement mutex on pthreads-based platforms. They -// are used in conjunction with class MutexLock: -// -// Mutex mutex; -// ... -// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end -// // of the current scope. -// -// MutexBase implements behavior for both statically and dynamically -// allocated mutexes. Do not use MutexBase directly. Instead, write -// the following to define a static mutex: -// -// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); -// -// You can forward declare a static mutex like this: -// -// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); -// -// To create a dynamic mutex, just define an object of type Mutex. -class MutexBase { - public: - // Acquires this mutex. - void Lock() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); - owner_ = pthread_self(); - has_owner_ = true; - } - - // Releases this mutex. - void Unlock() { - // Since the lock is being released the owner_ field should no longer be - // considered valid. We don't protect writing to has_owner_ here, as it's - // the caller's responsibility to ensure that the current thread holds the - // mutex when this is called. - has_owner_ = false; - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); - } - - // Does nothing if the current thread holds the mutex. Otherwise, crashes - // with high probability. - void AssertHeld() const { - GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) - << "The current thread is not holding the mutex @" << this; - } - - // A static mutex may be used before main() is entered. It may even - // be used before the dynamic initialization stage. Therefore we - // must be able to initialize a static mutex object at link time. - // This means MutexBase has to be a POD and its member variables - // have to be public. - public: - pthread_mutex_t mutex_; // The underlying pthread mutex. - // has_owner_ indicates whether the owner_ field below contains a valid thread - // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All - // accesses to the owner_ field should be protected by a check of this field. - // An alternative might be to memset() owner_ to all zeros, but there's no - // guarantee that a zero'd pthread_t is necessarily invalid or even different - // from pthread_self(). - bool has_owner_; - pthread_t owner_; // The thread holding the mutex. -}; - -// Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex - -// Defines and statically (i.e. at link time) initializes a static mutex. -// The initialization list here does not explicitly initialize each field, -// instead relying on default initialization for the unspecified fields. In -// particular, the owner_ field (a pthread_t) is not explicitly initialized. -// This allows initialization to work whether pthread_t is a scalar or struct. -// The flag -Wmissing-field-initializers must not be specified for this to work. -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false } - -// The Mutex class can only be used for mutexes created at runtime. It -// shares its API with MutexBase otherwise. -class Mutex : public MutexBase { - public: - Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - has_owner_ = false; - } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); -}; - -// We cannot name this class MutexLock as the ctor declaration would -// conflict with a macro named MutexLock, which is defined on some -// platforms. Hence the typedef trick below. -class GTestMutexLock { - public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } - - ~GTestMutexLock() { mutex_->Unlock(); } - - private: - MutexBase* const mutex_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); -}; - -typedef GTestMutexLock MutexLock; - -// Helpers for ThreadLocal. - -// pthread_key_create() requires DeleteThreadLocalValue() to have -// C-linkage. Therefore it cannot be templatized to access -// ThreadLocal. Hence the need for class -// ThreadLocalValueHolderBase. -class ThreadLocalValueHolderBase { - public: - virtual ~ThreadLocalValueHolderBase() {} -}; - -// Called by pthread to delete thread-local data stored by -// pthread_setspecific(). -extern "C" inline void DeleteThreadLocalValue(void* value_holder) { - delete static_cast(value_holder); -} - -// Implements thread-local storage on pthreads-based systems. -// -// // Thread 1 -// ThreadLocal tl(100); // 100 is the default value for each thread. -// -// // Thread 2 -// tl.set(150); // Changes the value for thread 2 only. -// EXPECT_EQ(150, tl.get()); -// -// // Thread 1 -// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. -// tl.set(200); -// EXPECT_EQ(200, tl.get()); -// -// The template type argument T must have a public copy constructor. -// In addition, the default ThreadLocal constructor requires T to have -// a public default constructor. -// -// An object managed for a thread by a ThreadLocal instance is deleted -// when the thread exits. Or, if the ThreadLocal instance dies in -// that thread, when the ThreadLocal dies. It's the user's -// responsibility to ensure that all other threads using a ThreadLocal -// have exited when it dies, or the per-thread objects for those -// threads will not be deleted. -// -// Google Test only uses global ThreadLocal objects. That means they -// will die after main() has returned. Therefore, no per-thread -// object managed by Google Test will be leaked as long as all threads -// using Google Test have exited when main() returns. -template -class ThreadLocal { - public: - ThreadLocal() : key_(CreateKey()), - default_() {} - explicit ThreadLocal(const T& value) : key_(CreateKey()), - default_(value) {} - - ~ThreadLocal() { - // Destroys the managed object for the current thread, if any. - DeleteThreadLocalValue(pthread_getspecific(key_)); - - // Releases resources associated with the key. This will *not* - // delete managed objects for other threads. - GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); - } - - T* pointer() { return GetOrCreateValue(); } - const T* pointer() const { return GetOrCreateValue(); } - const T& get() const { return *pointer(); } - void set(const T& value) { *pointer() = value; } - - private: - // Holds a value of type T. - class ValueHolder : public ThreadLocalValueHolderBase { - public: - explicit ValueHolder(const T& value) : value_(value) {} - - T* pointer() { return &value_; } - - private: - T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); - }; - - static pthread_key_t CreateKey() { - pthread_key_t key; - // When a thread exits, DeleteThreadLocalValue() will be called on - // the object managed for that thread. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_key_create(&key, &DeleteThreadLocalValue)); - return key; - } - - T* GetOrCreateValue() const { - ThreadLocalValueHolderBase* const holder = - static_cast(pthread_getspecific(key_)); - if (holder != NULL) { - return CheckedDowncastToActualType(holder)->pointer(); - } - - ValueHolder* const new_holder = new ValueHolder(default_); - ThreadLocalValueHolderBase* const holder_base = new_holder; - GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); - return new_holder->pointer(); - } - - // A key pthreads uses for looking up per-thread values. - const pthread_key_t key_; - const T default_; // The default value for each thread. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); -}; - -# define GTEST_IS_THREADSAFE 1 - -#else // GTEST_HAS_PTHREAD - -// A dummy implementation of synchronization primitives (mutex, lock, -// and thread-local variable). Necessary for compiling Google Test where -// mutex is not supported - using Google Test in multiple threads is not -// supported on such platforms. - -class Mutex { - public: - Mutex() {} - void Lock() {} - void Unlock() {} - void AssertHeld() const {} -}; - -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex - -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex - -class GTestMutexLock { - public: - explicit GTestMutexLock(Mutex*) {} // NOLINT -}; - -typedef GTestMutexLock MutexLock; - -template -class ThreadLocal { - public: - ThreadLocal() : value_() {} - explicit ThreadLocal(const T& value) : value_(value) {} - T* pointer() { return &value_; } - const T* pointer() const { return &value_; } - const T& get() const { return value_; } - void set(const T& value) { value_ = value; } - private: - T value_; -}; - -// The above synchronization primitives have dummy implementations. -// Therefore Google Test is not thread-safe. -# define GTEST_IS_THREADSAFE 0 - -#endif // GTEST_HAS_PTHREAD - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -GTEST_API_ size_t GetThreadCount(); - -// Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian -// and the IBM XL C/C++ compiler try to instantiate a copy constructor -// for objects passed through ellipsis (...), failing for uncopyable -// objects. We define this to ensure that only POD is passed through -// ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_ELLIPSIS_NEEDS_POD_ 1 -#else -# define GTEST_CAN_COMPARE_NULL 1 -#endif - -// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between -// const T& and const T* in a function template. These compilers -// _can_ decide between class template specializations for T and T*, -// so a tr1::type_traits-like is_pointer works. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) -# define GTEST_NEEDS_IS_POINTER_ 1 -#endif - -template -struct bool_constant { - typedef bool_constant type; - static const bool value = bool_value; -}; -template const bool bool_constant::value; - -typedef bool_constant false_type; -typedef bool_constant true_type; - -template -struct is_pointer : public false_type {}; - -template -struct is_pointer : public true_type {}; - -template -struct IteratorTraits { - typedef typename Iterator::value_type value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 -// The biggest signed integer type the compiler supports. -typedef __int64 BiggestInt; -#else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 -typedef long long BiggestInt; // NOLINT -#endif // GTEST_OS_WINDOWS - -// Utilities for char. - -// isspace(int ch) and friends accept an unsigned char or EOF. char -// may be signed, depending on the compiler (or compiler flags). -// Therefore we need to cast a char to unsigned char before calling -// isspace(), etc. - -inline bool IsAlpha(char ch) { - return isalpha(static_cast(ch)) != 0; -} -inline bool IsAlNum(char ch) { - return isalnum(static_cast(ch)) != 0; -} -inline bool IsDigit(char ch) { - return isdigit(static_cast(ch)) != 0; -} -inline bool IsLower(char ch) { - return islower(static_cast(ch)) != 0; -} -inline bool IsSpace(char ch) { - return isspace(static_cast(ch)) != 0; -} -inline bool IsUpper(char ch) { - return isupper(static_cast(ch)) != 0; -} -inline bool IsXDigit(char ch) { - return isxdigit(static_cast(ch)) != 0; -} -inline bool IsXDigit(wchar_t ch) { - const unsigned char low_byte = static_cast(ch); - return ch == low_byte && isxdigit(low_byte) != 0; -} - -inline char ToLower(char ch) { - return static_cast(tolower(static_cast(ch))); -} -inline char ToUpper(char ch) { - return static_cast(toupper(static_cast(ch))); -} - -// The testing::internal::posix namespace holds wrappers for common -// POSIX functions. These wrappers hide the differences between -// Windows/MSVC and POSIX systems. Since some compilers define these -// standard functions as macros, the wrapper cannot have the same name -// as the wrapped function. - -namespace posix { - -// Functions with a different name on Windows. - -#if GTEST_OS_WINDOWS - -typedef struct _stat StatStruct; - -# ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } -# else -inline int IsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE -inline int StrCaseCmp(const char* s1, const char* s2) { - return _stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ - -# if GTEST_OS_WINDOWS_MOBILE -inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } -// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this -// time and thus not defined there. -# else -inline int FileNo(FILE* file) { return _fileno(file); } -inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } -inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE - -#else - -typedef struct stat StatStruct; - -inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } -inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return strcasecmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -inline int RmDir(const char* dir) { return rmdir(dir); } -inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } - -#endif // GTEST_OS_WINDOWS - -// Functions deprecated by MSVC 8.0. - -#ifdef _MSC_VER -// Temporarily disable warning 4996 (deprecated function). -# pragma warning(push) -# pragma warning(disable:4996) -#endif - -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - -// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and -// StrError() aren't needed on Windows CE at this time and thus not -// defined there. - -#if !GTEST_OS_WINDOWS_MOBILE -inline int ChDir(const char* dir) { return chdir(dir); } -#endif -inline FILE* FOpen(const char* path, const char* mode) { - return fopen(path, mode); -} -#if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { - return freopen(path, mode, stream); -} -inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } -#endif -inline int FClose(FILE* fp) { return fclose(fp); } -#if !GTEST_OS_WINDOWS_MOBILE -inline int Read(int fd, void* buf, unsigned int count) { - return static_cast(read(fd, buf, count)); -} -inline int Write(int fd, const void* buf, unsigned int count) { - return static_cast(write(fd, buf, count)); -} -inline int Close(int fd) { return close(fd); } -inline const char* StrError(int errnum) { return strerror(errnum); } -#endif -inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE - // We are on Windows CE, which has no environment variables. - return NULL; -#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) - // Environment variables which we programmatically clear will be set to the - // empty string rather than unset (NULL). Handle that case. - const char* const env = getenv(name); - return (env != NULL && env[0] != '\0') ? env : NULL; -#else - return getenv(name); -#endif -} - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - -#if GTEST_OS_WINDOWS_MOBILE -// Windows CE has no C library. The abort() function is used in -// several places in Google Test. This implementation provides a reasonable -// imitation of standard behaviour. -void Abort(); -#else -inline void Abort() { abort(); } -#endif // GTEST_OS_WINDOWS_MOBILE - -} // namespace posix - -// MSVC "deprecates" snprintf and issues warnings wherever it is used. In -// order to avoid these warnings, we need to use _snprintf or _snprintf_s on -// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate -// function in order to achieve that. We use macro definition here because -// snprintf is a variadic function. -#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE -// MSVC 2005 and above support variadic macros. -# define GTEST_SNPRINTF_(buffer, size, format, ...) \ - _snprintf_s(buffer, size, size, format, __VA_ARGS__) -#elif defined(_MSC_VER) -// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't -// complain about _snprintf. -# define GTEST_SNPRINTF_ _snprintf -#else -# define GTEST_SNPRINTF_ snprintf -#endif - -// The maximum number a BiggestInt can represent. This definition -// works no matter BiggestInt is represented in one's complement or -// two's complement. -// -// We cannot rely on numeric_limits in STL, as __int64 and long long -// are not part of standard C++ and numeric_limits doesn't need to be -// defined for them. -const BiggestInt kMaxBiggestInt = - ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); - -// This template class serves as a compile-time function from size to -// type. It maps a size in bytes to a primitive type with that -// size. e.g. -// -// TypeWithSize<4>::UInt -// -// is typedef-ed to be unsigned int (unsigned integer made up of 4 -// bytes). -// -// Such functionality should belong to STL, but I cannot find it -// there. -// -// Google Test uses this class in the implementation of floating-point -// comparison. -// -// For now it only handles UInt (unsigned int) as that's all Google Test -// needs. Other types can be easily added in the future if need -// arises. -template -class TypeWithSize { - public: - // This prevents the user from using TypeWithSize with incorrect - // values of N. - typedef void UInt; -}; - -// The specialization for size 4. -template <> -class TypeWithSize<4> { - public: - // unsigned int has size 4 in both gcc and MSVC. - // - // As base/basictypes.h doesn't compile on Windows, we cannot use - // uint32, uint64, and etc here. - typedef int Int; - typedef unsigned int UInt; -}; - -// The specialization for size 8. -template <> -class TypeWithSize<8> { - public: -#if GTEST_OS_WINDOWS - typedef __int64 Int; - typedef unsigned __int64 UInt; -#else - typedef long long Int; // NOLINT - typedef unsigned long long UInt; // NOLINT -#endif // GTEST_OS_WINDOWS -}; - -// Integer types of known sizes. -typedef TypeWithSize<4>::Int Int32; -typedef TypeWithSize<4>::UInt UInt32; -typedef TypeWithSize<8>::Int Int64; -typedef TypeWithSize<8>::UInt UInt64; -typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. - -// Utilities for command line flags and environment variables. - -// Macro for referencing flags. -#define GTEST_FLAG(name) FLAGS_gtest_##name - -// Macros for declaring flags. -#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -#define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::std::string GTEST_FLAG(name) - -// Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) - -// Thread annotations -#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -#define GTEST_LOCK_EXCLUDED_(locks) - -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. -// TODO(chandlerc): Find a better way to refactor flag and environment parsing -// out of both gtest-port.cc and gtest.cc to avoid exporting this utility -// function. -bool ParseInt32(const Message& src_text, const char* str, Int32* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromGTestEnv(const char* flag, bool default_val); -GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -const char* StringFromGTestEnv(const char* flag, const char* default_val); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -#if GTEST_OS_LINUX -# include -# include -# include -# include -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#include -#include -#include -#include -#include -#include - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the Message class. -// -// IMPORTANT NOTE: Due to limitation of the C++ language, we have to -// leave some internal implementation details in this header file. -// They are clearly marked by comments like this: -// -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -// -// Such code is NOT meant to be used by a user directly, and is subject -// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user -// program! - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ - -#include - - -// Ensures that there is at least one operator<< in the global namespace. -// See Message& operator<<(...) below for why. -void operator<<(const testing::internal::Secret&, int); - -namespace testing { - -// The Message class works like an ostream repeater. -// -// Typical usage: -// -// 1. You stream a bunch of values to a Message object. -// It will remember the text in a stringstream. -// 2. Then you stream the Message object to an ostream. -// This causes the text in the Message to be streamed -// to the ostream. -// -// For example; -// -// testing::Message foo; -// foo << 1 << " != " << 2; -// std::cout << foo; -// -// will print "1 != 2". -// -// Message is not intended to be inherited from. In particular, its -// destructor is not virtual. -// -// Note that stringstream behaves differently in gcc and in MSVC. You -// can stream a NULL char pointer to it in the former, but not in the -// latter (it causes an access violation if you do). The Message -// class hides this difference by treating a NULL char pointer as -// "(null)". -class GTEST_API_ Message { - private: - // The type of basic IO manipulators (endl, ends, and flush) for - // narrow streams. - typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); - - public: - // Constructs an empty Message. - Message(); - - // Copy constructor. - Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT - *ss_ << msg.GetString(); - } - - // Constructs a Message from a C-string. - explicit Message(const char* str) : ss_(new ::std::stringstream) { - *ss_ << str; - } - -#if GTEST_OS_SYMBIAN - // Streams a value (either a pointer or not) to this object. - template - inline Message& operator <<(const T& value) { - StreamHelper(typename internal::is_pointer::type(), value); - return *this; - } -#else - // Streams a non-pointer value to this object. - template - inline Message& operator <<(const T& val) { - // Some libraries overload << for STL containers. These - // overloads are defined in the global namespace instead of ::std. - // - // C++'s symbol lookup rule (i.e. Koenig lookup) says that these - // overloads are visible in either the std namespace or the global - // namespace, but not other namespaces, including the testing - // namespace which Google Test's Message class is in. - // - // To allow STL containers (and other types that has a << operator - // defined in the global namespace) to be used in Google Test - // assertions, testing::Message must access the custom << operator - // from the global namespace. With this using declaration, - // overloads of << defined in the global namespace and those - // visible via Koenig lookup are both exposed in this function. - using ::operator <<; - *ss_ << val; - return *this; - } - - // Streams a pointer value to this object. - // - // This function is an overload of the previous one. When you - // stream a pointer to a Message, this definition will be used as it - // is more specialized. (The C++ Standard, section - // [temp.func.order].) If you stream a non-pointer, then the - // previous definition will be used. - // - // The reason for this overload is that streaming a NULL pointer to - // ostream is undefined behavior. Depending on the compiler, you - // may get "0", "(nil)", "(null)", or an access violation. To - // ensure consistent result across compilers, we always treat NULL - // as "(null)". - template - inline Message& operator <<(T* const& pointer) { // NOLINT - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - return *this; - } -#endif // GTEST_OS_SYMBIAN - - // Since the basic IO manipulators are overloaded for both narrow - // and wide streams, we have to provide this specialized definition - // of operator <<, even though its body is the same as the - // templatized version above. Without this definition, streaming - // endl or other basic IO manipulators to Message will confuse the - // compiler. - Message& operator <<(BasicNarrowIoManip val) { - *ss_ << val; - return *this; - } - - // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } - - // These two overloads allow streaming a wide C string to a Message - // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str); - Message& operator <<(wchar_t* wide_c_str); - -#if GTEST_HAS_STD_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::wstring& wstr); -#endif // GTEST_HAS_GLOBAL_WSTRING - - // Gets the text streamed to this object so far as an std::string. - // Each '\0' character in the buffer is replaced with "\\0". - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - std::string GetString() const; - - private: - -#if GTEST_OS_SYMBIAN - // These are needed as the Nokia Symbian Compiler cannot decide between - // const T& and const T* in a function template. The Nokia compiler _can_ - // decide between class template specializations for T and T*, so a - // tr1::type_traits-like is_pointer works, and we can overload on that. - template - inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) { - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - *ss_ << pointer; - } - } - template - inline void StreamHelper(internal::false_type /*is_pointer*/, - const T& value) { - // See the comments in Message& operator <<(const T&) above for why - // we need this using statement. - using ::operator <<; - *ss_ << value; - } -#endif // GTEST_OS_SYMBIAN - - // We'll hold the text streamed to this object here. - const internal::scoped_ptr< ::std::stringstream> ss_; - - // We declare (but don't implement) this to prevent the compiler - // from implementing the assignment operator. - void operator=(const Message&); -}; - -// Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { - return os << sb.GetString(); -} - -namespace internal { - -// Converts a streamable value to an std::string. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". -template -std::string StreamableToString(const T& streamable) { - return (Message() << streamable).GetString(); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares the String class and functions used internally by -// Google Test. They are subject to change without notice. They should not used -// by code external to Google Test. -// -// This header file is #included by . -// It should not be #included by other files. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ - -#ifdef __BORLANDC__ -// string.h is not guaranteed to provide strcpy on C++ Builder. -# include -#endif - -#include -#include - - -namespace testing { -namespace internal { - -// String - an abstract class holding static string utilities. -class GTEST_API_ String { - public: - // Static utility methods - - // Clones a 0-terminated C string, allocating memory using new. The - // caller is responsible for deleting the return value using - // delete[]. Returns the cloned string, or NULL if the input is - // NULL. - // - // This is different from strdup() in string.h, which allocates - // memory using malloc(). - static const char* CloneCString(const char* c_str); - -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be - // able to pass strings to Win32 APIs on CE we need to convert them - // to 'Unicode', UTF-16. - - // Creates a UTF-16 wide string from the given ANSI string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the wide string, or NULL if the - // input is NULL. - // - // The wide string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static LPCWSTR AnsiToUtf16(const char* c_str); - - // Creates an ANSI string from the given wide string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the ANSI string, or NULL if the - // input is NULL. - // - // The returned string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static const char* Utf16ToAnsi(LPCWSTR utf16_str); -#endif - - // Compares two C strings. Returns true iff they have the same content. - // - // Unlike strcmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CStringEquals(const char* lhs, const char* rhs); - - // Converts a wide C string to a String using the UTF-8 encoding. - // NULL will be converted to "(null)". If an error occurred during - // the conversion, "(failed to convert from wide string)" is - // returned. - static std::string ShowWideCString(const wchar_t* wide_c_str); - - // Compares two wide C strings. Returns true iff they have the same - // content. - // - // Unlike wcscmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - - // Compares two C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike strcasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. - static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs); - - // Returns true iff the given string ends with the given suffix, ignoring - // case. Any string is considered to end with an empty suffix. - static bool EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix); - - // Formats an int value as "%02d". - static std::string FormatIntWidth2(int value); // "%02d" for width == 2 - - // Formats an int value as "%X". - static std::string FormatHexInt(int value); - - // Formats a byte as "%02X". - static std::string FormatByte(unsigned char value); - - private: - String(); // Not meant to be instantiated. -}; // class String - -// Gets the content of the stringstream's buffer as an std::string. Each '\0' -// character in the buffer is replaced with "\\0". -GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: keith.ray@gmail.com (Keith Ray) -// -// Google Test filepath utilities -// -// This header file declares classes and functions used internally by -// Google Test. They are subject to change without notice. -// -// This file is #included in . -// Do not include this header file separately! - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ - - -namespace testing { -namespace internal { - -// FilePath - a class for file and directory pathname manipulation which -// handles platform-specific conventions (like the pathname separator). -// Used for helper functions for naming files in a directory for xml output. -// Except for Set methods, all methods are const or static, which provides an -// "immutable value object" -- useful for peace of mind. -// A FilePath with a value ending in a path separator ("like/this/") represents -// a directory, otherwise it is assumed to represent a file. In either case, -// it may or may not represent an actual file or directory in the file system. -// Names are NOT checked for syntax correctness -- no checking for illegal -// characters, malformed paths, etc. - -class GTEST_API_ FilePath { - public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } - - explicit FilePath(const std::string& pathname) : pathname_(pathname) { - Normalize(); - } - - FilePath& operator=(const FilePath& rhs) { - Set(rhs); - return *this; - } - - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } - - const std::string& string() const { return pathname_; } - const char* c_str() const { return pathname_.c_str(); } - - // Returns the current working directory, or "" if unsuccessful. - static FilePath GetCurrentDir(); - - // Given directory = "dir", base_name = "test", number = 0, - // extension = "xml", returns "dir/test.xml". If number is greater - // than zero (e.g., 12), returns "dir/test_12.xml". - // On Windows platform, uses \ as the separator rather than /. - static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension); - - // Given directory = "dir", relative_path = "test.xml", - // returns "dir/test.xml". - // On Windows, uses \ as the separator rather than /. - static FilePath ConcatPaths(const FilePath& directory, - const FilePath& relative_path); - - // Returns a pathname for a file that does not currently exist. The pathname - // will be directory/base_name.extension or - // directory/base_name_.extension if directory/base_name.extension - // already exists. The number will be incremented until a pathname is found - // that does not already exist. - // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. - // There could be a race condition if two or more processes are calling this - // function at the same time -- they could both pick the same filename. - static FilePath GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension); - - // Returns true iff the path is "". - bool IsEmpty() const { return pathname_.empty(); } - - // If input name has a trailing separator character, removes it and returns - // the name, otherwise return the name string unmodified. - // On Windows platform, uses \ as the separator, other platforms use /. - FilePath RemoveTrailingPathSeparator() const; - - // Returns a copy of the FilePath with the directory part removed. - // Example: FilePath("path/to/file").RemoveDirectoryName() returns - // FilePath("file"). If there is no directory part ("just_a_file"), it returns - // the FilePath unmodified. If there is no file part ("just_a_dir/") it - // returns an empty FilePath (""). - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveDirectoryName() const; - - // RemoveFileName returns the directory path with the filename removed. - // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". - // If the FilePath is "a_file" or "/a_file", RemoveFileName returns - // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does - // not have a file, like "just/a/dir/", it returns the FilePath unmodified. - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveFileName() const; - - // Returns a copy of the FilePath with the case-insensitive extension removed. - // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns - // FilePath("dir/file"). If a case-insensitive extension is not - // found, returns a copy of the original FilePath. - FilePath RemoveExtension(const char* extension) const; - - // Creates directories so that path exists. Returns true if successful or if - // the directories already exist; returns false if unable to create - // directories for any reason. Will also return false if the FilePath does - // not represent a directory (that is, it doesn't end with a path separator). - bool CreateDirectoriesRecursively() const; - - // Create the directory so that path exists. Returns true if successful or - // if the directory already exists; returns false if unable to create the - // directory for any reason, including if the parent directory does not - // exist. Not named "CreateDirectory" because that's a macro on Windows. - bool CreateFolder() const; - - // Returns true if FilePath describes something in the file-system, - // either a file, directory, or whatever, and that something exists. - bool FileOrDirectoryExists() const; - - // Returns true if pathname describes a directory in the file-system - // that exists. - bool DirectoryExists() const; - - // Returns true if FilePath ends with a path separator, which indicates that - // it is intended to represent a directory. Returns false otherwise. - // This does NOT check that a directory (or file) actually exists. - bool IsDirectory() const; - - // Returns true if pathname describes a root directory. (Windows has one - // root directory per disk drive.) - bool IsRootDirectory() const; - - // Returns true if pathname describes an absolute path. - bool IsAbsolutePath() const; - - private: - // Replaces multiple consecutive separators with a single separator. - // For example, "bar///foo" becomes "bar/foo". Does not eliminate other - // redundancies that might be in a pathname involving "." or "..". - // - // A pathname with multiple consecutive separators may occur either through - // user error or as a result of some scripts or APIs that generate a pathname - // with a trailing separator. On other platforms the same API or script - // may NOT generate a pathname with a trailing "/". Then elsewhere that - // pathname may have another "/" and pathname components added to it, - // without checking for the separator already being there. - // The script language and operating system may allow paths like "foo//bar" - // but some of the functions in FilePath will not handle that correctly. In - // particular, RemoveTrailingPathSeparator() only removes one separator, and - // it is called in CreateDirectoriesRecursively() assuming that it will change - // a pathname from directory syntax (trailing separator) to filename syntax. - // - // On Windows this method also replaces the alternate path separator '/' with - // the primary path separator '\\', so that for example "bar\\/\\foo" becomes - // "bar\\foo". - - void Normalize(); - - // Returns a pointer to the last occurence of a valid path separator in - // the FilePath. On Windows, for example, both '/' and '\' are valid path - // separators. Returns NULL if no path separator was found. - const char* FindLastPathSeparator() const; - - std::string pathname_; -}; // class FilePath - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -// This file was GENERATED by command: -// pump.py gtest-type-util.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test case. -// Please contact googletestframework@googlegroups.com if you need -// more. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - - -// #ifdef __GNUC__ is too general here. It is possible to use gcc without using -// libstdc++ (which is where cxxabi.h comes from). -# if GTEST_HAS_CXXABI_H_ -# include -# elif defined(__HP_aCC) -# include -# endif // GTEST_HASH_CXXABI_H_ - -namespace testing { -namespace internal { - -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. -template -std::string GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) - int status = 0; - // gcc's implementation of typeid(T).name() mangles the type name, - // so we have to demangle it. -# if GTEST_HAS_CXXABI_H_ - using abi::__cxa_demangle; -# endif // GTEST_HAS_CXXABI_H_ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); - const std::string name_str(status == 0 ? readable_name : name); - free(readable_name); - return name_str; -# else - return name; -# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC - -# else - - return ""; - -# endif // GTEST_HAS_RTTI -} - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - -// A unique type used as the default value for the arguments of class -// template Types. This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. -struct None {}; - -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; -template -struct Types2 { - typedef T1 Head; - typedef Types1 Tail; -}; - -template -struct Types3 { - typedef T1 Head; - typedef Types2 Tail; -}; - -template -struct Types4 { - typedef T1 Head; - typedef Types3 Tail; -}; - -template -struct Types5 { - typedef T1 Head; - typedef Types4 Tail; -}; - -template -struct Types6 { - typedef T1 Head; - typedef Types5 Tail; -}; - -template -struct Types7 { - typedef T1 Head; - typedef Types6 Tail; -}; - -template -struct Types8 { - typedef T1 Head; - typedef Types7 Tail; -}; - -template -struct Types9 { - typedef T1 Head; - typedef Types8 Tail; -}; - -template -struct Types10 { - typedef T1 Head; - typedef Types9 Tail; -}; - -template -struct Types11 { - typedef T1 Head; - typedef Types10 Tail; -}; - -template -struct Types12 { - typedef T1 Head; - typedef Types11 Tail; -}; - -template -struct Types13 { - typedef T1 Head; - typedef Types12 Tail; -}; - -template -struct Types14 { - typedef T1 Head; - typedef Types13 Tail; -}; - -template -struct Types15 { - typedef T1 Head; - typedef Types14 Tail; -}; - -template -struct Types16 { - typedef T1 Head; - typedef Types15 Tail; -}; - -template -struct Types17 { - typedef T1 Head; - typedef Types16 Tail; -}; - -template -struct Types18 { - typedef T1 Head; - typedef Types17 Tail; -}; - -template -struct Types19 { - typedef T1 Head; - typedef Types18 Tail; -}; - -template -struct Types20 { - typedef T1 Head; - typedef Types19 Tail; -}; - -template -struct Types21 { - typedef T1 Head; - typedef Types20 Tail; -}; - -template -struct Types22 { - typedef T1 Head; - typedef Types21 Tail; -}; - -template -struct Types23 { - typedef T1 Head; - typedef Types22 Tail; -}; - -template -struct Types24 { - typedef T1 Head; - typedef Types23 Tail; -}; - -template -struct Types25 { - typedef T1 Head; - typedef Types24 Tail; -}; - -template -struct Types26 { - typedef T1 Head; - typedef Types25 Tail; -}; - -template -struct Types27 { - typedef T1 Head; - typedef Types26 Tail; -}; - -template -struct Types28 { - typedef T1 Head; - typedef Types27 Tail; -}; - -template -struct Types29 { - typedef T1 Head; - typedef Types28 Tail; -}; - -template -struct Types30 { - typedef T1 Head; - typedef Types29 Tail; -}; - -template -struct Types31 { - typedef T1 Head; - typedef Types30 Tail; -}; - -template -struct Types32 { - typedef T1 Head; - typedef Types31 Tail; -}; - -template -struct Types33 { - typedef T1 Head; - typedef Types32 Tail; -}; - -template -struct Types34 { - typedef T1 Head; - typedef Types33 Tail; -}; - -template -struct Types35 { - typedef T1 Head; - typedef Types34 Tail; -}; - -template -struct Types36 { - typedef T1 Head; - typedef Types35 Tail; -}; - -template -struct Types37 { - typedef T1 Head; - typedef Types36 Tail; -}; - -template -struct Types38 { - typedef T1 Head; - typedef Types37 Tail; -}; - -template -struct Types39 { - typedef T1 Head; - typedef Types38 Tail; -}; - -template -struct Types40 { - typedef T1 Head; - typedef Types39 Tail; -}; - -template -struct Types41 { - typedef T1 Head; - typedef Types40 Tail; -}; - -template -struct Types42 { - typedef T1 Head; - typedef Types41 Tail; -}; - -template -struct Types43 { - typedef T1 Head; - typedef Types42 Tail; -}; - -template -struct Types44 { - typedef T1 Head; - typedef Types43 Tail; -}; - -template -struct Types45 { - typedef T1 Head; - typedef Types44 Tail; -}; - -template -struct Types46 { - typedef T1 Head; - typedef Types45 Tail; -}; - -template -struct Types47 { - typedef T1 Head; - typedef Types46 Tail; -}; - -template -struct Types48 { - typedef T1 Head; - typedef Types47 Tail; -}; - -template -struct Types49 { - typedef T1 Head; - typedef Types48 Tail; -}; - -template -struct Types50 { - typedef T1 Head; - typedef Types49 Tail; -}; - - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. -template -struct Types { - typedef internal::Types50 type; -}; - -template <> -struct Types { - typedef internal::Types0 type; -}; -template -struct Types { - typedef internal::Types1 type; -}; -template -struct Types { - typedef internal::Types2 type; -}; -template -struct Types { - typedef internal::Types3 type; -}; -template -struct Types { - typedef internal::Types4 type; -}; -template -struct Types { - typedef internal::Types5 type; -}; -template -struct Types { - typedef internal::Types6 type; -}; -template -struct Types { - typedef internal::Types7 type; -}; -template -struct Types { - typedef internal::Types8 type; -}; -template -struct Types { - typedef internal::Types9 type; -}; -template -struct Types { - typedef internal::Types10 type; -}; -template -struct Types { - typedef internal::Types11 type; -}; -template -struct Types { - typedef internal::Types12 type; -}; -template -struct Types { - typedef internal::Types13 type; -}; -template -struct Types { - typedef internal::Types14 type; -}; -template -struct Types { - typedef internal::Types15 type; -}; -template -struct Types { - typedef internal::Types16 type; -}; -template -struct Types { - typedef internal::Types17 type; -}; -template -struct Types { - typedef internal::Types18 type; -}; -template -struct Types { - typedef internal::Types19 type; -}; -template -struct Types { - typedef internal::Types20 type; -}; -template -struct Types { - typedef internal::Types21 type; -}; -template -struct Types { - typedef internal::Types22 type; -}; -template -struct Types { - typedef internal::Types23 type; -}; -template -struct Types { - typedef internal::Types24 type; -}; -template -struct Types { - typedef internal::Types25 type; -}; -template -struct Types { - typedef internal::Types26 type; -}; -template -struct Types { - typedef internal::Types27 type; -}; -template -struct Types { - typedef internal::Types28 type; -}; -template -struct Types { - typedef internal::Types29 type; -}; -template -struct Types { - typedef internal::Types30 type; -}; -template -struct Types { - typedef internal::Types31 type; -}; -template -struct Types { - typedef internal::Types32 type; -}; -template -struct Types { - typedef internal::Types33 type; -}; -template -struct Types { - typedef internal::Types34 type; -}; -template -struct Types { - typedef internal::Types35 type; -}; -template -struct Types { - typedef internal::Types36 type; -}; -template -struct Types { - typedef internal::Types37 type; -}; -template -struct Types { - typedef internal::Types38 type; -}; -template -struct Types { - typedef internal::Types39 type; -}; -template -struct Types { - typedef internal::Types40 type; -}; -template -struct Types { - typedef internal::Types41 type; -}; -template -struct Types { - typedef internal::Types42 type; -}; -template -struct Types { - typedef internal::Types43 type; -}; -template -struct Types { - typedef internal::Types44 type; -}; -template -struct Types { - typedef internal::Types45 type; -}; -template -struct Types { - typedef internal::Types46 type; -}; -template -struct Types { - typedef internal::Types47 type; -}; -template -struct Types { - typedef internal::Types48 type; -}; -template -struct Types { - typedef internal::Types49 type; -}; - -namespace internal { - -# define GTEST_TEMPLATE_ template class - -// The template "selector" struct TemplateSel is used to -// represent Tmpl, which must be a class template with one type -// parameter, as a type. TemplateSel::Bind::type is defined -// as the type Tmpl. This allows us to actually instantiate the -// template "selected" by TemplateSel. -// -// This trick is necessary for simulating typedef for class templates, -// which C++ doesn't support directly. -template -struct TemplateSel { - template - struct Bind { - typedef Tmpl type; - }; -}; - -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type - -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. - -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; -template -struct Templates2 { - typedef TemplateSel Head; - typedef Templates1 Tail; -}; - -template -struct Templates3 { - typedef TemplateSel Head; - typedef Templates2 Tail; -}; - -template -struct Templates4 { - typedef TemplateSel Head; - typedef Templates3 Tail; -}; - -template -struct Templates5 { - typedef TemplateSel Head; - typedef Templates4 Tail; -}; - -template -struct Templates6 { - typedef TemplateSel Head; - typedef Templates5 Tail; -}; - -template -struct Templates7 { - typedef TemplateSel Head; - typedef Templates6 Tail; -}; - -template -struct Templates8 { - typedef TemplateSel Head; - typedef Templates7 Tail; -}; - -template -struct Templates9 { - typedef TemplateSel Head; - typedef Templates8 Tail; -}; - -template -struct Templates10 { - typedef TemplateSel Head; - typedef Templates9 Tail; -}; - -template -struct Templates11 { - typedef TemplateSel Head; - typedef Templates10 Tail; -}; - -template -struct Templates12 { - typedef TemplateSel Head; - typedef Templates11 Tail; -}; - -template -struct Templates13 { - typedef TemplateSel Head; - typedef Templates12 Tail; -}; - -template -struct Templates14 { - typedef TemplateSel Head; - typedef Templates13 Tail; -}; - -template -struct Templates15 { - typedef TemplateSel Head; - typedef Templates14 Tail; -}; - -template -struct Templates16 { - typedef TemplateSel Head; - typedef Templates15 Tail; -}; - -template -struct Templates17 { - typedef TemplateSel Head; - typedef Templates16 Tail; -}; - -template -struct Templates18 { - typedef TemplateSel Head; - typedef Templates17 Tail; -}; - -template -struct Templates19 { - typedef TemplateSel Head; - typedef Templates18 Tail; -}; - -template -struct Templates20 { - typedef TemplateSel Head; - typedef Templates19 Tail; -}; - -template -struct Templates21 { - typedef TemplateSel Head; - typedef Templates20 Tail; -}; - -template -struct Templates22 { - typedef TemplateSel Head; - typedef Templates21 Tail; -}; - -template -struct Templates23 { - typedef TemplateSel Head; - typedef Templates22 Tail; -}; - -template -struct Templates24 { - typedef TemplateSel Head; - typedef Templates23 Tail; -}; - -template -struct Templates25 { - typedef TemplateSel Head; - typedef Templates24 Tail; -}; - -template -struct Templates26 { - typedef TemplateSel Head; - typedef Templates25 Tail; -}; - -template -struct Templates27 { - typedef TemplateSel Head; - typedef Templates26 Tail; -}; - -template -struct Templates28 { - typedef TemplateSel Head; - typedef Templates27 Tail; -}; - -template -struct Templates29 { - typedef TemplateSel Head; - typedef Templates28 Tail; -}; - -template -struct Templates30 { - typedef TemplateSel Head; - typedef Templates29 Tail; -}; - -template -struct Templates31 { - typedef TemplateSel Head; - typedef Templates30 Tail; -}; - -template -struct Templates32 { - typedef TemplateSel Head; - typedef Templates31 Tail; -}; - -template -struct Templates33 { - typedef TemplateSel Head; - typedef Templates32 Tail; -}; - -template -struct Templates34 { - typedef TemplateSel Head; - typedef Templates33 Tail; -}; - -template -struct Templates35 { - typedef TemplateSel Head; - typedef Templates34 Tail; -}; - -template -struct Templates36 { - typedef TemplateSel Head; - typedef Templates35 Tail; -}; - -template -struct Templates37 { - typedef TemplateSel Head; - typedef Templates36 Tail; -}; - -template -struct Templates38 { - typedef TemplateSel Head; - typedef Templates37 Tail; -}; - -template -struct Templates39 { - typedef TemplateSel Head; - typedef Templates38 Tail; -}; - -template -struct Templates40 { - typedef TemplateSel Head; - typedef Templates39 Tail; -}; - -template -struct Templates41 { - typedef TemplateSel Head; - typedef Templates40 Tail; -}; - -template -struct Templates42 { - typedef TemplateSel Head; - typedef Templates41 Tail; -}; - -template -struct Templates43 { - typedef TemplateSel Head; - typedef Templates42 Tail; -}; - -template -struct Templates44 { - typedef TemplateSel Head; - typedef Templates43 Tail; -}; - -template -struct Templates45 { - typedef TemplateSel Head; - typedef Templates44 Tail; -}; - -template -struct Templates46 { - typedef TemplateSel Head; - typedef Templates45 Tail; -}; - -template -struct Templates47 { - typedef TemplateSel Head; - typedef Templates46 Tail; -}; - -template -struct Templates48 { - typedef TemplateSel Head; - typedef Templates47 Tail; -}; - -template -struct Templates49 { - typedef TemplateSel Head; - typedef Templates48 Tail; -}; - -template -struct Templates50 { - typedef TemplateSel Head; - typedef Templates49 Tail; -}; - - -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. -template -struct Templates { - typedef Templates50 type; -}; - -template <> -struct Templates { - typedef Templates0 type; -}; -template -struct Templates { - typedef Templates1 type; -}; -template -struct Templates { - typedef Templates2 type; -}; -template -struct Templates { - typedef Templates3 type; -}; -template -struct Templates { - typedef Templates4 type; -}; -template -struct Templates { - typedef Templates5 type; -}; -template -struct Templates { - typedef Templates6 type; -}; -template -struct Templates { - typedef Templates7 type; -}; -template -struct Templates { - typedef Templates8 type; -}; -template -struct Templates { - typedef Templates9 type; -}; -template -struct Templates { - typedef Templates10 type; -}; -template -struct Templates { - typedef Templates11 type; -}; -template -struct Templates { - typedef Templates12 type; -}; -template -struct Templates { - typedef Templates13 type; -}; -template -struct Templates { - typedef Templates14 type; -}; -template -struct Templates { - typedef Templates15 type; -}; -template -struct Templates { - typedef Templates16 type; -}; -template -struct Templates { - typedef Templates17 type; -}; -template -struct Templates { - typedef Templates18 type; -}; -template -struct Templates { - typedef Templates19 type; -}; -template -struct Templates { - typedef Templates20 type; -}; -template -struct Templates { - typedef Templates21 type; -}; -template -struct Templates { - typedef Templates22 type; -}; -template -struct Templates { - typedef Templates23 type; -}; -template -struct Templates { - typedef Templates24 type; -}; -template -struct Templates { - typedef Templates25 type; -}; -template -struct Templates { - typedef Templates26 type; -}; -template -struct Templates { - typedef Templates27 type; -}; -template -struct Templates { - typedef Templates28 type; -}; -template -struct Templates { - typedef Templates29 type; -}; -template -struct Templates { - typedef Templates30 type; -}; -template -struct Templates { - typedef Templates31 type; -}; -template -struct Templates { - typedef Templates32 type; -}; -template -struct Templates { - typedef Templates33 type; -}; -template -struct Templates { - typedef Templates34 type; -}; -template -struct Templates { - typedef Templates35 type; -}; -template -struct Templates { - typedef Templates36 type; -}; -template -struct Templates { - typedef Templates37 type; -}; -template -struct Templates { - typedef Templates38 type; -}; -template -struct Templates { - typedef Templates39 type; -}; -template -struct Templates { - typedef Templates40 type; -}; -template -struct Templates { - typedef Templates41 type; -}; -template -struct Templates { - typedef Templates42 type; -}; -template -struct Templates { - typedef Templates43 type; -}; -template -struct Templates { - typedef Templates44 type; -}; -template -struct Templates { - typedef Templates45 type; -}; -template -struct Templates { - typedef Templates46 type; -}; -template -struct Templates { - typedef Templates47 type; -}; -template -struct Templates { - typedef Templates48 type; -}; -template -struct Templates { - typedef Templates49 type; -}; - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). - -template -struct TypeList { - typedef Types1 type; -}; - -template -struct TypeList > { - typedef typename Types::type type; -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - -// Due to C++ preprocessor weirdness, we need double indirection to -// concatenate two tokens when one of them is __LINE__. Writing -// -// foo ## __LINE__ -// -// will result in the token foo__LINE__, instead of foo followed by -// the current line number. For more details, see -// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 -#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar - -class ProtocolMessage; -namespace proto2 { class Message; } - -namespace testing { - -// Forward declarations. - -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test cases. - -template -::std::string PrintToString(const T& value); - -namespace internal { - -struct TraceInfo; // Information about a trace point. -class ScopedTrace; // Implements scoped trace. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest - -// How many times InitGoogleTest() has been called. -GTEST_API_ extern int g_init_gtest_count; - -// The text used in failure messages to indicate the start of the -// stack trace. -GTEST_API_ extern const char kStackTraceMarker[]; - -// Two overloaded helpers for checking at compile time whether an -// expression is a null pointer literal (i.e. NULL or any 0-valued -// compile-time integral constant). Their return values have -// different sizes, so we can use sizeof() to test which version is -// picked by the compiler. These helpers have no implementations, as -// we only need their signatures. -// -// Given IsNullLiteralHelper(x), the compiler will pick the first -// version if x can be implicitly converted to Secret*, and pick the -// second version otherwise. Since Secret is a secret and incomplete -// type, the only expression a user can write that has type Secret* is -// a null pointer literal. Therefore, we know that x is a null -// pointer literal if and only if the first version is picked by the -// compiler. -char IsNullLiteralHelper(Secret* p); -char (&IsNullLiteralHelper(...))[2]; // NOLINT - -// A compile-time bool constant that is true if and only if x is a -// null pointer literal (i.e. NULL or any 0-valued compile-time -// integral constant). -#ifdef GTEST_ELLIPSIS_NEEDS_POD_ -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_IS_NULL_LITERAL_(x) false -#else -# define GTEST_IS_NULL_LITERAL_(x) \ - (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) -#endif // GTEST_ELLIPSIS_NEEDS_POD_ - -// Appends the user-supplied message to the Google-Test-generated message. -GTEST_API_ std::string AppendUserMessage( - const std::string& gtest_msg, const Message& user_msg); - -#if GTEST_HAS_EXCEPTIONS - -// This exception is thrown by (and only by) a failed Google Test -// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions -// are enabled). We derive it from std::runtime_error, which is for -// errors presumably detectable only at run time. Since -// std::runtime_error inherits from std::exception, many testing -// frameworks know how to extract and print the message inside it. -class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { - public: - explicit GoogleTestFailureException(const TestPartResult& failure); -}; - -#endif // GTEST_HAS_EXCEPTIONS - -// A helper class for creating scoped traces in user programs. -class GTEST_API_ ScopedTrace { - public: - // The c'tor pushes the given source file location and message onto - // a trace stack maintained by Google Test. - ScopedTrace(const char* file, int line, const Message& message); - - // The d'tor pops the info pushed by the c'tor. - // - // Note that the d'tor is not virtual in order to be efficient. - // Don't inherit from ScopedTrace! - ~ScopedTrace(); - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); -} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its - // c'tor and d'tor. Therefore it doesn't - // need to be used otherwise. - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -GTEST_API_ AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case); - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -GTEST_API_ std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); - -// This template class represents an IEEE floating-point number -// (either single-precision or double-precision, depending on the -// template parameters). -// -// The purpose of this class is to do more sophisticated number -// comparison. (Due to round-off error, etc, it's very unlikely that -// two floating-points will be equal exactly. Hence a naive -// comparison by the == operation often doesn't work.) -// -// Format of IEEE floating-point: -// -// The most-significant bit being the leftmost, an IEEE -// floating-point looks like -// -// sign_bit exponent_bits fraction_bits -// -// Here, sign_bit is a single bit that designates the sign of the -// number. -// -// For float, there are 8 exponent bits and 23 fraction bits. -// -// For double, there are 11 exponent bits and 52 fraction bits. -// -// More details can be found at -// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. -// -// Template parameter: -// -// RawType: the raw floating-point type (either float or double) -template -class FloatingPoint { - public: - // Defines the unsigned integer type that has the same size as the - // floating point number. - typedef typename TypeWithSize::UInt Bits; - - // Constants. - - // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); - - // # of fraction bits in a number. - static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; - - // # of exponent bits in a number. - static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; - - // The mask for the sign bit. - static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); - - // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); - - // The mask for the exponent bits. - static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); - - // How many ULP's (Units in the Last Place) we want to tolerate when - // comparing two numbers. The larger the value, the more error we - // allow. A 0 value means that two numbers must be exactly the same - // to be considered equal. - // - // The maximum error of a single floating-point operation is 0.5 - // units in the last place. On Intel CPU's, all floating-point - // calculations are done with 80-bit precision, while double has 64 - // bits. Therefore, 4 should be enough for ordinary use. - // - // See the following article for more details on ULP: - // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ - static const size_t kMaxUlps = 4; - - // Constructs a FloatingPoint from a raw floating-point number. - // - // On an Intel CPU, passing a non-normalized NAN (Not a Number) - // around may change its bits, although the new value is guaranteed - // to be also a NAN. Therefore, don't expect this constructor to - // preserve the bits in x when x is a NAN. - explicit FloatingPoint(const RawType& x) { u_.value_ = x; } - - // Static methods - - // Reinterprets a bit pattern as a floating-point number. - // - // This function is needed to test the AlmostEquals() method. - static RawType ReinterpretBits(const Bits bits) { - FloatingPoint fp(0); - fp.u_.bits_ = bits; - return fp.u_.value_; - } - - // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } - - // Returns the maximum representable finite floating-point number. - static RawType Max(); - - // Non-static methods - - // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } - - // Returns the exponent bits of this number. - Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } - - // Returns the fraction bits of this number. - Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } - - // Returns the sign bit of this number. - Bits sign_bit() const { return kSignBitMask & u_.bits_; } - - // Returns true iff this is NAN (not a number). - bool is_nan() const { - // It's a NAN if the exponent bits are all ones and the fraction - // bits are not entirely zeros. - return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); - } - - // Returns true iff this number is at most kMaxUlps ULP's away from - // rhs. In particular, this function: - // - // - returns false if either number is (or both are) NAN. - // - treats really large numbers as almost equal to infinity. - // - thinks +0.0 and -0.0 are 0 DLP's apart. - bool AlmostEquals(const FloatingPoint& rhs) const { - // The IEEE standard says that any comparison operation involving - // a NAN must return false. - if (is_nan() || rhs.is_nan()) return false; - - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; - } - - private: - // The data type used to store the actual floating-point number. - union FloatingPointUnion { - RawType value_; // The raw floating-point number. - Bits bits_; // The bits that represent the number. - }; - - // Converts an integer from the sign-and-magnitude representation to - // the biased representation. More precisely, let N be 2 to the - // power of (kBitCount - 1), an integer x is represented by the - // unsigned number x + N. - // - // For instance, - // - // -N + 1 (the most negative number representable using - // sign-and-magnitude) is represented by 1; - // 0 is represented by N; and - // N - 1 (the biggest number representable using - // sign-and-magnitude) is represented by 2N - 1. - // - // Read http://en.wikipedia.org/wiki/Signed_number_representations - // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { - if (kSignBitMask & sam) { - // sam represents a negative number. - return ~sam + 1; - } else { - // sam represents a positive number. - return kSignBitMask | sam; - } - } - - // Given two numbers in the sign-and-magnitude representation, - // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { - const Bits biased1 = SignAndMagnitudeToBiased(sam1); - const Bits biased2 = SignAndMagnitudeToBiased(sam2); - return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); - } - - FloatingPointUnion u_; -}; - -// We cannot use std::numeric_limits::max() as it clashes with the max() -// macro defined by . -template <> -inline float FloatingPoint::Max() { return FLT_MAX; } -template <> -inline double FloatingPoint::Max() { return DBL_MAX; } - -// Typedefs the instances of the FloatingPoint template class that we -// care to use. -typedef FloatingPoint Float; -typedef FloatingPoint Double; - -// In order to catch the mistake of putting tests that use different -// test fixture classes in the same test case, we need to assign -// unique IDs to fixture classes and compare them. The TypeId type is -// used to hold such IDs. The user should treat TypeId as an opaque -// type: the only operation allowed on TypeId values is to compare -// them for equality using the == operator. -typedef const void* TypeId; - -template -class TypeIdHelper { - public: - // dummy_ must not have a const type. Otherwise an overly eager - // compiler (e.g. MSVC 7.1 & 8.0) may try to merge - // TypeIdHelper::dummy_ for different Ts as an "optimization". - static bool dummy_; -}; - -template -bool TypeIdHelper::dummy_ = false; - -// GetTypeId() returns the ID of type T. Different values will be -// returned for different types. Calling the function twice with the -// same type argument is guaranteed to return the same ID. -template -TypeId GetTypeId() { - // The compiler is required to allocate a different - // TypeIdHelper::dummy_ variable for each T used to instantiate - // the template. Therefore, the address of dummy_ is guaranteed to - // be unique. - return &(TypeIdHelper::dummy_); -} - -// Returns the type ID of ::testing::Test. Always call this instead -// of GetTypeId< ::testing::Test>() to get the type ID of -// ::testing::Test, as the latter may give the wrong result due to a -// suspected linker bug when compiling Google Test as a Mac OS X -// framework. -GTEST_API_ TypeId GetTestTypeId(); - -// Defines the abstract factory interface that creates instances -// of a Test object. -class TestFactoryBase { - public: - virtual ~TestFactoryBase() {} - - // Creates a test instance to run. The instance is both created and destroyed - // within TestInfoImpl::Run() - virtual Test* CreateTest() = 0; - - protected: - TestFactoryBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); -}; - -// This class provides implementation of TeastFactoryBase interface. -// It is used in TEST and TEST_F macros. -template -class TestFactoryImpl : public TestFactoryBase { - public: - virtual Test* CreateTest() { return new TestClass; } -}; - -#if GTEST_OS_WINDOWS - -// Predicate-formatters for implementing the HRESULT checking macros -// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} -// We pass a long instead of HRESULT to avoid causing an -// include dependency for the HRESULT type. -GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, - long hr); // NOLINT -GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, - long hr); // NOLINT - -#endif // GTEST_OS_WINDOWS - -// Types of SetUpTestCase() and TearDownTestCase() functions. -typedef void (*SetUpTestCaseFunc)(); -typedef void (*TearDownTestCaseFunc)(); - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, -// or NULL if this is not a type-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -GTEST_API_ TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory); - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// State of the definition of a type-parameterized test case. -class GTEST_API_ TypedTestCasePState { - public: - TypedTestCasePState() : registered_(false) {} - - // Adds the given test name to defined_test_names_ and return true - // if the test case hasn't been registered; otherwise aborts the - // program. - bool AddTestName(const char* file, int line, const char* case_name, - const char* test_name) { - if (registered_) { - fprintf(stderr, "%s Test %s must be defined before " - "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", - FormatFileLocation(file, line).c_str(), test_name, case_name); - fflush(stderr); - posix::Abort(); - } - defined_test_names_.insert(test_name); - return true; - } - - // Verifies that registered_tests match the test names in - // defined_test_names_; returns registered_tests if successful, or - // aborts the program otherwise. - const char* VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests); - - private: - bool registered_; - ::std::set defined_test_names_; -}; - -// Skips to the first non-space char after the first comma in 'str'; -// returns NULL if no comma is found in 'str'. -inline const char* SkipComma(const char* str) { - const char* comma = strchr(str, ','); - if (comma == NULL) { - return NULL; - } - while (IsSpace(*(++comma))) {} - return comma; -} - -// Returns the prefix of 'str' before the first comma in it; returns -// the entire string if it contains no comma. -inline std::string GetPrefixUntilComma(const char* str) { - const char* comma = strchr(str, ','); - return comma == NULL ? str : std::string(str, comma); -} - -// TypeParameterizedTest::Register() -// registers a list of type-parameterized tests with Google Test. The -// return value is insignificant - we just need to return something -// such that we can call this function in a namespace scope. -// -// Implementation note: The GTEST_TEMPLATE_ macro declares a template -// template parameter. It's defined in gtest-type-util.h. -template -class TypeParameterizedTest { - public: - // 'index' is the index of the test in the type list 'Types' - // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, - // Types). Valid values for 'index' are [0, N - 1] where N is the - // length of Types. - static bool Register(const char* prefix, const char* case_name, - const char* test_names, int index) { - typedef typename Types::Head Type; - typedef Fixture FixtureClass; - typedef typename GTEST_BIND_(TestSel, Type) TestClass; - - // First, registers the first type-parameterized test in the type - // list. - MakeAndRegisterTestInfo( - (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/" - + StreamableToString(index)).c_str(), - GetPrefixUntilComma(test_names).c_str(), - GetTypeName().c_str(), - NULL, // No value parameter. - GetTypeId(), - TestClass::SetUpTestCase, - TestClass::TearDownTestCase, - new TestFactoryImpl); - - // Next, recurses (at compile time) with the tail of the type list. - return TypeParameterizedTest - ::Register(prefix, case_name, test_names, index + 1); - } -}; - -// The base case for the compile time recursion. -template -class TypeParameterizedTest { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/, int /*index*/) { - return true; - } -}; - -// TypeParameterizedTestCase::Register() -// registers *all combinations* of 'Tests' and 'Types' with Google -// Test. The return value is insignificant - we just need to return -// something such that we can call this function in a namespace scope. -template -class TypeParameterizedTestCase { - public: - static bool Register(const char* prefix, const char* case_name, - const char* test_names) { - typedef typename Tests::Head Head; - - // First, register the first test in 'Test' for each type in 'Types'. - TypeParameterizedTest::Register( - prefix, case_name, test_names, 0); - - // Next, recurses (at compile time) with the tail of the test list. - return TypeParameterizedTestCase - ::Register(prefix, case_name, SkipComma(test_names)); - } -}; - -// The base case for the compile time recursion. -template -class TypeParameterizedTestCase { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/) { - return true; - } -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ std::string GetCurrentOsStackTraceExceptTop( - UnitTest* unit_test, int skip_count); - -// Helpers for suppressing warnings on unreachable code or constant -// condition. - -// Always returns true. -GTEST_API_ bool AlwaysTrue(); - -// Always returns false. -inline bool AlwaysFalse() { return !AlwaysTrue(); } - -// Helper for suppressing false warning from Clang on a const char* -// variable declared in a conditional expression always being NULL in -// the else branch. -struct GTEST_API_ ConstCharPtr { - ConstCharPtr(const char* str) : value(str) {} - operator bool() const { return true; } - const char* value; -}; - -// A simple Linear Congruential Generator for generating random -// numbers with a uniform distribution. Unlike rand() and srand(), it -// doesn't use global state (and therefore can't interfere with user -// code). Unlike rand_r(), it's portable. An LCG isn't very random, -// but it's good enough for our purposes. -class GTEST_API_ Random { - public: - static const UInt32 kMaxRange = 1u << 31; - - explicit Random(UInt32 seed) : state_(seed) {} - - void Reseed(UInt32 seed) { state_ = seed; } - - // Generates a random number from [0, range). Crashes if 'range' is - // 0 or greater than kMaxRange. - UInt32 Generate(UInt32 range); - - private: - UInt32 state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); -}; - -// Defining a variable of type CompileAssertTypesEqual will cause a -// compiler error iff T1 and T2 are different types. -template -struct CompileAssertTypesEqual; - -template -struct CompileAssertTypesEqual { -}; - -// Removes the reference from a type if it is a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::remove_reference, which is not widely available yet. -template -struct RemoveReference { typedef T type; }; // NOLINT -template -struct RemoveReference { typedef T type; }; // NOLINT - -// A handy wrapper around RemoveReference that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_REFERENCE_(T) \ - typename ::testing::internal::RemoveReference::type - -// Removes const from a type if it is a const type, otherwise leaves -// it unchanged. This is the same as tr1::remove_const, which is not -// widely available yet. -template -struct RemoveConst { typedef T type; }; // NOLINT -template -struct RemoveConst { typedef T type; }; // NOLINT - -// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above -// definition to fail to remove the const in 'const int[3]' and 'const -// char[3][4]'. The following specialization works around the bug. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; - -#if defined(_MSC_VER) && _MSC_VER < 1400 -// This is the only specialization that allows VC++ 7.1 to remove const in -// 'const int[3] and 'const int[3][4]'. However, it causes trouble with GCC -// and thus needs to be conditionally compiled. -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; -#endif - -// A handy wrapper around RemoveConst that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_CONST_(T) \ - typename ::testing::internal::RemoveConst::type - -// Turns const U&, U&, const U, and U all into U. -#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ - GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) - -// Adds reference to a type if it is not a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::add_reference, which is not widely available yet. -template -struct AddReference { typedef T& type; }; // NOLINT -template -struct AddReference { typedef T& type; }; // NOLINT - -// A handy wrapper around AddReference that works when the argument T -// depends on template parameters. -#define GTEST_ADD_REFERENCE_(T) \ - typename ::testing::internal::AddReference::type - -// Adds a reference to const on top of T as necessary. For example, -// it transforms -// -// char ==> const char& -// const char ==> const char& -// char& ==> const char& -// const char& ==> const char& -// -// The argument T must depend on some template parameters. -#define GTEST_REFERENCE_TO_CONST_(T) \ - GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) - -// ImplicitlyConvertible::value is a compile-time bool -// constant that's true iff type From can be implicitly converted to -// type To. -template -class ImplicitlyConvertible { - private: - // We need the following helper functions only for their types. - // They have no implementations. - - // MakeFrom() is an expression whose type is From. We cannot simply - // use From(), as the type From may not have a public default - // constructor. - static From MakeFrom(); - - // These two functions are overloaded. Given an expression - // Helper(x), the compiler will pick the first version if x can be - // implicitly converted to type To; otherwise it will pick the - // second version. - // - // The first version returns a value of size 1, and the second - // version returns a value of size 2. Therefore, by checking the - // size of Helper(x), which can be done at compile time, we can tell - // which version of Helper() is used, and hence whether x can be - // implicitly converted to type To. - static char Helper(To); - static char (&Helper(...))[2]; // NOLINT - - // We have to put the 'public' section after the 'private' section, - // or MSVC refuses to compile the code. - public: - // MSVC warns about implicitly converting from double to int for - // possible loss of data, so we need to temporarily disable the - // warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4244) // Temporarily disables warning 4244. - - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -# pragma warning(pop) // Restores the warning state. -#elif defined(__BORLANDC__) - // C++Builder cannot use member overload resolution during template - // instantiation. The simplest workaround is to use its C++0x type traits - // functions (C++Builder 2009 and above only). - static const bool value = __is_convertible(From, To); -#else - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -#endif // _MSV_VER -}; -template -const bool ImplicitlyConvertible::value; - -// IsAProtocolMessage::value is a compile-time bool constant that's -// true iff T is type ProtocolMessage, proto2::Message, or a subclass -// of those. -template -struct IsAProtocolMessage - : public bool_constant< - ImplicitlyConvertible::value || - ImplicitlyConvertible::value> { -}; - -// When the compiler sees expression IsContainerTest(0), if C is an -// STL-style container class, the first overload of IsContainerTest -// will be viable (since both C::iterator* and C::const_iterator* are -// valid types and NULL can be implicitly converted to them). It will -// be picked over the second overload as 'int' is a perfect match for -// the type of argument 0. If C::iterator or C::const_iterator is not -// a valid type, the first overload is not viable, and the second -// overload will be picked. Therefore, we can determine whether C is -// a container class by checking the type of IsContainerTest(0). -// The value of the expression is insignificant. -// -// Note that we look for both C::iterator and C::const_iterator. The -// reason is that C++ injects the name of a class as a member of the -// class itself (e.g. you can refer to class iterator as either -// 'iterator' or 'iterator::iterator'). If we look for C::iterator -// only, for example, we would mistakenly think that a class named -// iterator is an STL container. -// -// Also note that the simpler approach of overloading -// IsContainerTest(typename C::const_iterator*) and -// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. -typedef int IsContainer; -template -IsContainer IsContainerTest(int /* dummy */, - typename C::iterator* /* it */ = NULL, - typename C::const_iterator* /* const_it */ = NULL) { - return 0; -} - -typedef char IsNotContainer; -template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } - -// EnableIf::type is void when 'Cond' is true, and -// undefined when 'Cond' is false. To use SFINAE to make a function -// overload only apply when a particular expression is true, add -// "typename EnableIf::type* = 0" as the last parameter. -template struct EnableIf; -template<> struct EnableIf { typedef void type; }; // NOLINT - -// Utilities for native arrays. - -// ArrayEq() compares two k-dimensional native arrays using the -// elements' operator==, where k can be any integer >= 0. When k is -// 0, ArrayEq() degenerates into comparing a single pair of values. - -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs); - -// This generic version is used when k is 0. -template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } - -// This overload is used when k >= 1. -template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { - return internal::ArrayEq(lhs, N, rhs); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous ArrayEq() function, arrays with different sizes would -// lead to different copies of the template code. -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs) { - for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; - } - return true; -} - -// Finds the first element in the iterator range [begin, end) that -// equals elem. Element may be a native array type itself. -template -Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { - for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; - } - return end; -} - -// CopyArray() copies a k-dimensional native array using the elements' -// operator=, where k can be any integer >= 0. When k is 0, -// CopyArray() degenerates into copying a single value. - -template -void CopyArray(const T* from, size_t size, U* to); - -// This generic version is used when k is 0. -template -inline void CopyArray(const T& from, U* to) { *to = from; } - -// This overload is used when k >= 1. -template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { - internal::CopyArray(from, N, *to); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous CopyArray() function, arrays with different sizes -// would lead to different copies of the template code. -template -void CopyArray(const T* from, size_t size, U* to) { - for (size_t i = 0; i != size; i++) { - internal::CopyArray(from[i], to + i); - } -} - -// The relation between an NativeArray object (see below) and the -// native array it represents. -enum RelationToSource { - kReference, // The NativeArray references the native array. - kCopy // The NativeArray makes a copy of the native array and - // owns the copy. -}; - -// Adapts a native array to a read-only STL-style container. Instead -// of the complete STL container concept, this adaptor only implements -// members useful for Google Mock's container matchers. New members -// should be added as needed. To simplify the implementation, we only -// support Element being a raw type (i.e. having no top-level const or -// reference modifier). It's the client's responsibility to satisfy -// this requirement. Element can be an array type itself (hence -// multi-dimensional arrays are supported). -template -class NativeArray { - public: - // STL-style container typedefs. - typedef Element value_type; - typedef Element* iterator; - typedef const Element* const_iterator; - - // Constructs from a native array. - NativeArray(const Element* array, size_t count, RelationToSource relation) { - Init(array, count, relation); - } - - // Copy constructor. - NativeArray(const NativeArray& rhs) { - Init(rhs.array_, rhs.size_, rhs.relation_to_source_); - } - - ~NativeArray() { - // Ensures that the user doesn't instantiate NativeArray with a - // const or reference type. - static_cast(StaticAssertTypeEqHelper()); - if (relation_to_source_ == kCopy) - delete[] array_; - } - - // STL-style container methods. - size_t size() const { return size_; } - const_iterator begin() const { return array_; } - const_iterator end() const { return array_ + size_; } - bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); - } - - private: - // Initializes this object; makes a copy of the input array if - // 'relation' is kCopy. - void Init(const Element* array, size_t a_size, RelationToSource relation) { - if (relation == kReference) { - array_ = array; - } else { - Element* const copy = new Element[a_size]; - CopyArray(array, a_size, copy); - array_ = copy; - } - size_ = a_size; - relation_to_source_ = relation; - } - - const Element* array_; - size_t size_; - RelationToSource relation_to_source_; - - GTEST_DISALLOW_ASSIGN_(NativeArray); -}; - -} // namespace internal -} // namespace testing - -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() - -#define GTEST_MESSAGE_(message, result_type) \ - GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) - -#define GTEST_FATAL_FAILURE_(message) \ - return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) - -#define GTEST_NONFATAL_FAILURE_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) - -#define GTEST_SUCCESS_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) - -// Suppresses MSVC warnings 4072 (unreachable code) for the code following -// statement if it returns or throws (or doesn't return or throw in some -// situations). -#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } - -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) - -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: it throws.") - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - - -// Implements Boolean test assertions such as EXPECT_TRUE. expression can be -// either a boolean expression or an AssertionResult. text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. -#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") - -// Expands to the name of the class that implements the given test. -#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - test_case_name##_##test_name##_Test - -// Helper macro for defining tests. -#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ - private:\ - virtual void TestBody();\ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ -\ -::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ - #test_case_name, #test_name, NULL, NULL, \ - (parent_id), \ - parent_class::SetUpTestCase, \ - parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ -void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the public API for death tests. It is -// #included by gtest.h so a user doesn't need to include this -// directly. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines internal utilities needed for implementing -// death tests. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - - -#include - -namespace testing { -namespace internal { - -GTEST_DECLARE_string_(internal_run_death_test); - -// Names of the flags (needed for parsing Google Test flags). -const char kDeathTestStyleFlag[] = "death_test_style"; -const char kDeathTestUseFork[] = "death_test_use_fork"; -const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; - -#if GTEST_HAS_DEATH_TEST - -// DeathTest is a class that hides much of the complexity of the -// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method -// returns a concrete class that depends on the prevailing death test -// style, as defined by the --gtest_death_test_style and/or -// --gtest_internal_run_death_test flags. - -// In describing the results of death tests, these terms are used with -// the corresponding definitions: -// -// exit status: The integer exit information in the format specified -// by wait(2) -// exit code: The integer code passed to exit(3), _exit(2), or -// returned from main() -class GTEST_API_ DeathTest { - public: - // Create returns false if there was an error determining the - // appropriate action to take for the current death test; for example, - // if the gtest_death_test_style flag is set to an invalid value. - // The LastMessage method will return a more detailed message in that - // case. Otherwise, the DeathTest pointer pointed to by the "test" - // argument is set. If the death test should be skipped, the pointer - // is set to NULL; otherwise, it is set to the address of a new concrete - // DeathTest object that controls the execution of the current test. - static bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); - DeathTest(); - virtual ~DeathTest() { } - - // A helper class that aborts a death test when it's deleted. - class ReturnSentinel { - public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } - ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } - private: - DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); - } GTEST_ATTRIBUTE_UNUSED_; - - // An enumeration of possible roles that may be taken when a death - // test is encountered. EXECUTE means that the death test logic should - // be executed immediately. OVERSEE means that the program should prepare - // the appropriate environment for a child process to execute the death - // test, then wait for it to complete. - enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; - - // An enumeration of the three reasons that a test might be aborted. - enum AbortReason { - TEST_ENCOUNTERED_RETURN_STATEMENT, - TEST_THREW_EXCEPTION, - TEST_DID_NOT_DIE - }; - - // Assumes one of the above roles. - virtual TestRole AssumeRole() = 0; - - // Waits for the death test to finish and returns its status. - virtual int Wait() = 0; - - // Returns true if the death test passed; that is, the test process - // exited during the test, its exit status matches a user-supplied - // predicate, and its stderr output matches a user-supplied regular - // expression. - // The user-supplied predicate may be a macro expression rather - // than a function pointer or functor, or else Wait and Passed could - // be combined. - virtual bool Passed(bool exit_status_ok) = 0; - - // Signals that the death test did not die as expected. - virtual void Abort(AbortReason reason) = 0; - - // Returns a human-readable outcome message regarding the outcome of - // the last death test. - static const char* LastMessage(); - - static void set_last_death_test_message(const std::string& message); - - private: - // A string containing a description of the outcome of the last death test. - static std::string last_death_test_message_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); -}; - -// Factory interface for death tests. May be mocked out for testing. -class DeathTestFactory { - public: - virtual ~DeathTestFactory() { } - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) = 0; -}; - -// A concrete DeathTestFactory implementation for normal use. -class DefaultDeathTestFactory : public DeathTestFactory { - public: - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); -}; - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -GTEST_API_ bool ExitedUnsuccessfully(int exit_status); - -// Traps C++ exceptions escaping statement and reports them as test -// failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ - ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } - -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) - -# endif - -// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, -// ASSERT_EXIT*, and EXPECT_EXIT*. -# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - ::testing::internal::DeathTest* gtest_dt; \ - if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ - __FILE__, __LINE__, >est_dt)) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - if (gtest_dt != NULL) { \ - ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ - gtest_dt_ptr(gtest_dt); \ - switch (gtest_dt->AssumeRole()) { \ - case ::testing::internal::DeathTest::OVERSEE_TEST: \ - if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - break; \ - case ::testing::internal::DeathTest::EXECUTE_TEST: { \ - ::testing::internal::DeathTest::ReturnSentinel \ - gtest_sentinel(gtest_dt); \ - GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ - gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ - break; \ - } \ - default: \ - break; \ - } \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ - fail(::testing::internal::DeathTest::LastMessage()) -// The symbol "fail" here expands to something into which a message -// can be streamed. - -// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in -// NDEBUG mode. In this case we need the statements to be executed, the regex is -// ignored, and the macro must accept a streamed message even though the message -// is never printed. -# define GTEST_EXECUTE_STATEMENT_(statement, regex) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } else \ - ::testing::Message() - -// A class representing the parsed contents of the -// --gtest_internal_run_death_test flag, as it existed when -// RUN_ALL_TESTS was called. -class InternalRunDeathTestFlag { - public: - InternalRunDeathTestFlag(const std::string& a_file, - int a_line, - int an_index, - int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} - - ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); - } - - const std::string& file() const { return file_; } - int line() const { return line_; } - int index() const { return index_; } - int write_fd() const { return write_fd_; } - - private: - std::string file_; - int line_; - int index_; - int write_fd_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); -}; - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); - -#else // GTEST_HAS_DEATH_TEST - -// This macro is used for implementing macros such as -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where -// death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. -// -// Parameters: -// statement - A statement that a macro such as EXPECT_DEATH would test -// for program termination. This macro has to make sure this -// statement is compiled but not executed, to ensure that -// EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. -// regex - A regex that a macro such as EXPECT_DEATH would use to test -// the output of statement. This parameter has to be -// compiled but not evaluated by this macro, to ensure that -// this macro only accepts expressions that a macro such as -// EXPECT_DEATH would accept. -// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED -// and a return statement for ASSERT_DEATH_IF_SUPPORTED. -// This ensures that ASSERT_DEATH_IF_SUPPORTED will not -// compile inside functions where ASSERT_DEATH doesn't -// compile. -// -// The branch that has an always false condition is used to ensure that -// statement and regex are compiled (and thus syntactically correct) but -// never executed. The unreachable code macro protects the terminator -// statement from generating an 'unreachable code' warning in case -// statement unconditionally returns or throws. The Message constructor at -// the end allows the syntax of streaming additional messages into the -// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. -# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - -namespace testing { - -// This flag controls the style of death tests. Valid values are "threadsafe", -// meaning that the death test child process will re-execute the test binary -// from the start, running only a single death test, or "fast", -// meaning that the child process will execute the test logic immediately -// after forking. -GTEST_DECLARE_string_(death_test_style); - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. Using it may break the -// implementation of death tests. User code MUST NOT use it. -GTEST_API_ bool InDeathTestChild(); - -} // namespace internal - -// The following macros are useful for writing death tests. - -// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is -// executed: -// -// 1. It generates a warning if there is more than one active -// thread. This is because it's safe to fork() or clone() only -// when there is a single thread. -// -// 2. The parent process clone()s a sub-process and runs the death -// test in it; the sub-process exits with code 0 at the end of the -// death test, if it hasn't exited already. -// -// 3. The parent process waits for the sub-process to terminate. -// -// 4. The parent process checks the exit code and error message of -// the sub-process. -// -// Examples: -// -// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); -// for (int i = 0; i < 5; i++) { -// EXPECT_DEATH(server.ProcessRequest(i), -// "Invalid request .* in ProcessRequest()") -// << "Failed to die on request " << i; -// } -// -// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); -// -// bool KilledBySIGHUP(int exit_code) { -// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; -// } -// -// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); -// -// On the regular expressions used in death tests: -// -// On POSIX-compliant systems (*nix), we use the library, -// which uses the POSIX extended regex syntax. -// -// On other platforms (e.g. Windows), we only support a simple regex -// syntax implemented as part of Google Test. This limited -// implementation should be enough most of the time when writing -// death tests; though it lacks many features you can find in PCRE -// or POSIX extended regex syntax. For example, we don't support -// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and -// repetition count ("x{5,7}"), among others. -// -// Below is the syntax that we do support. We chose it to be a -// subset of both PCRE and POSIX extended regex, so it's easy to -// learn wherever you come from. In the following: 'A' denotes a -// literal character, period (.), or a single \\ escape sequence; -// 'x' and 'y' denote regular expressions; 'm' and 'n' are for -// natural numbers. -// -// c matches any literal character c -// \\d matches any decimal digit -// \\D matches any character that's not a decimal digit -// \\f matches \f -// \\n matches \n -// \\r matches \r -// \\s matches any ASCII whitespace, including \n -// \\S matches any character that's not a whitespace -// \\t matches \t -// \\v matches \v -// \\w matches any letter, _, or decimal digit -// \\W matches any character that \\w doesn't match -// \\c matches any literal character c, which must be a punctuation -// . matches any single character except \n -// A? matches 0 or 1 occurrences of A -// A* matches 0 or many occurrences of A -// A+ matches 1 or many occurrences of A -// ^ matches the beginning of a string (not that of each line) -// $ matches the end of a string (not that of each line) -// xy matches x followed by y -// -// If you accidentally use PCRE or POSIX extended regex features -// not implemented by us, you will get a run-time failure. In that -// case, please try to rewrite your regular expression within the -// above syntax. -// -// This implementation is *not* meant to be as highly tuned or robust -// as a compiled regex library, but should perform well enough for a -// death test, which already incurs significant overhead by launching -// a child process. -// -// Known caveats: -// -// A "threadsafe" style death test obtains the path to the test -// program from argv[0] and re-executes it in the sub-process. For -// simplicity, the current implementation doesn't search the PATH -// when launching the sub-process. This means that the user must -// invoke the test program via a path that contains at least one -// path separator (e.g. path/to/foo_test and -// /absolute/path/to/bar_test are fine, but foo_test is not). This -// is rarely a problem as people usually don't put the test binary -// directory in PATH. -// -// TODO(wan@google.com): make thread-safe death tests search the PATH. - -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) - -// Like ASSERT_EXIT, but continues on to successive tests in the -// test case, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) - -// Asserts that a given statement causes the program to exit, either by -// explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Like ASSERT_DEATH, but continues on to successive tests in the -// test case, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: - -// Tests that an exit code describes a normal exit with a given exit code. -class GTEST_API_ ExitedWithCode { - public: - explicit ExitedWithCode(int exit_code); - bool operator()(int exit_status) const; - private: - // No implementation - assignment is unsupported. - void operator=(const ExitedWithCode& other); - - const int exit_code_; -}; - -# if !GTEST_OS_WINDOWS -// Tests that an exit code describes an exit due to termination by a -// given signal. -class GTEST_API_ KilledBySignal { - public: - explicit KilledBySignal(int signum); - bool operator()(int exit_status) const; - private: - const int signum_; -}; -# endif // !GTEST_OS_WINDOWS - -// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. -// The death testing framework causes this to have interesting semantics, -// since the sideeffects of the call are only visible in opt mode, and not -// in debug mode. -// -// In practice, this can be used to test functions that utilize the -// LOG(DFATAL) macro using the following style: -// -// int DieInDebugOr12(int* sideeffect) { -// if (sideeffect) { -// *sideeffect = 12; -// } -// LOG(DFATAL) << "death"; -// return 12; -// } -// -// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { -// int sideeffect = 0; -// // Only asserts in dbg. -// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); -// -// #ifdef NDEBUG -// // opt-mode has sideeffect visible. -// EXPECT_EQ(12, sideeffect); -// #else -// // dbg-mode no visible sideeffect. -// EXPECT_EQ(0, sideeffect); -// #endif -// } -// -// This will assert that DieInDebugReturn12InOpt() crashes in debug -// mode, usually due to a DCHECK or LOG(DFATAL), but returns the -// appropriate fallback value (12 in this case) in opt mode. If you -// need to test that a function has appropriate side-effects in opt -// mode, include assertions against the side-effects. A general -// pattern for this is: -// -// EXPECT_DEBUG_DEATH({ -// // Side-effects here will have an effect after this statement in -// // opt mode, but none in debug mode. -// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); -// }, "death"); -// -# ifdef NDEBUG - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - GTEST_EXECUTE_STATEMENT_(statement, regex) - -# else - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) - -# endif // NDEBUG for EXPECT_DEBUG_DEATH -#endif // GTEST_HAS_DEATH_TEST - -// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and -// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if -// death tests are supported; otherwise they just issue a warning. This is -// useful when you are combining death test assertions with normal test -// assertions in one test. -#if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) -#else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) -#endif - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -// This file was GENERATED by command: -// pump.py gtest-param-test.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: vladl@google.com (Vlad Losev) -// -// Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) -// -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - - -// Value-parameterized tests allow you to test your code with different -// parameters without writing multiple copies of the same test. -// -// Here is how you use value-parameterized tests: - -#if 0 - -// To write value-parameterized tests, first you should define a fixture -// class. It is usually derived from testing::TestWithParam (see below for -// another inheritance scheme that's sometimes useful in more complicated -// class hierarchies), where the type of your parameter values. -// TestWithParam is itself derived from testing::Test. T can be any -// copyable type. If it's a raw pointer, you are responsible for managing the -// lifespan of the pointed values. - -class FooTest : public ::testing::TestWithParam { - // You can implement all the usual class fixture members here. -}; - -// Then, use the TEST_P macro to define as many parameterized tests -// for this fixture as you want. The _P suffix is for "parameterized" -// or "pattern", whichever you prefer to think. - -TEST_P(FooTest, DoesBlah) { - // Inside a test, access the test parameter with the GetParam() method - // of the TestWithParam class: - EXPECT_TRUE(foo.Blah(GetParam())); - ... -} - -TEST_P(FooTest, HasBlahBlah) { - ... -} - -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test -// case with any set of parameters you want. Google Test defines a number -// of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which -// are all in the testing namespace: -// -// -// Range(begin, end [, step]) - Yields values {begin, begin+step, -// begin+step+step, ...}. The values do not -// include end. step defaults to 1. -// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. -// ValuesIn(container) - Yields values from a C-style array, an STL -// ValuesIn(begin,end) container, or an iterator range [begin, end). -// Bool() - Yields sequence {false, true}. -// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product -// for the math savvy) of the values generated -// by the N generators. -// -// For more details, see comments at the definitions of these functions below -// in this file. -// -// The following statement will instantiate tests from the FooTest test case -// each with parameter values "meeny", "miny", and "moe". - -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); - -// To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different -// instantiations. The tests from the instantiation above will have -// these names: -// -// * InstantiationName/FooTest.DoesBlah/0 for "meeny" -// * InstantiationName/FooTest.DoesBlah/1 for "miny" -// * InstantiationName/FooTest.DoesBlah/2 for "moe" -// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" -// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" -// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" -// -// You can use these names in --gtest_filter. -// -// This statement will instantiate all tests from FooTest again, each -// with parameter values "cat" and "dog": - -const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); - -// The tests from the instantiation above will have these names: -// -// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" -// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" -// -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. -// -// Please also note that generator expressions (including parameters to the -// generators) are evaluated in InitGoogleTest(), after main() has started. -// This allows the user on one hand, to adjust generator parameters in order -// to dynamically determine a set of tests to run and on the other hand, -// give the user a chance to inspect the generated tests with Google Test -// reflection API before RUN_ALL_TESTS() is executed. -// -// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc -// for more examples. -// -// In the future, we plan to publish the API for defining new parameter -// generators. But for now this interface remains part of the internal -// implementation and is subject to change. -// -// -// A parameterized test fixture must be derived from testing::Test and from -// testing::WithParamInterface, where T is the type of the parameter -// values. Inheriting from TestWithParam satisfies that requirement because -// TestWithParam inherits from both Test and WithParamInterface. In more -// complicated hierarchies, however, it is occasionally useful to inherit -// separately from Test and WithParamInterface. For example: - -class BaseTest : public ::testing::Test { - // You can inherit all the usual members for a non-parameterized test - // fixture here. -}; - -class DerivedTest : public BaseTest, public ::testing::WithParamInterface { - // The usual test fixture members go here too. -}; - -TEST_F(BaseTest, HasFoo) { - // This is an ordinary non-parameterized test. -} - -TEST_P(DerivedTest, DoesBlah) { - // GetParam works just the same here as if you inherit from TestWithParam. - EXPECT_TRUE(foo.Blah(GetParam())); -} - -#endif // 0 - - -#if !GTEST_OS_SYMBIAN -# include -#endif - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ - -#include -#include -#include - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2003 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: Dan Egnor (egnor@google.com) -// -// A "smart" pointer type with reference tracking. Every pointer to a -// particular object is kept on a circular linked list. When the last pointer -// to an object is destroyed or reassigned, the object is deleted. -// -// Used properly, this deletes the object when the last reference goes away. -// There are several caveats: -// - Like all reference counting schemes, cycles lead to leaks. -// - Each smart pointer is actually two pointers (8 bytes instead of 4). -// - Every time a pointer is assigned, the entire list of pointers to that -// object is traversed. This class is therefore NOT SUITABLE when there -// will often be more than two or three pointers to a particular object. -// - References are only tracked as long as linked_ptr<> objects are copied. -// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS -// will happen (double deletion). -// -// A good use of this class is storing object references in STL containers. -// You can safely put linked_ptr<> in a vector<>. -// Other uses may not be as good. -// -// Note: If you use an incomplete type with linked_ptr<>, the class -// *containing* linked_ptr<> must have a constructor and destructor (even -// if they do nothing!). -// -// Bill Gibbons suggested we use something like this. -// -// Thread Safety: -// Unlike other linked_ptr implementations, in this implementation -// a linked_ptr object is thread-safe in the sense that: -// - it's safe to copy linked_ptr objects concurrently, -// - it's safe to copy *from* a linked_ptr and read its underlying -// raw pointer (e.g. via get()) concurrently, and -// - it's safe to write to two linked_ptrs that point to the same -// shared object concurrently. -// TODO(wan@google.com): rename this to safe_linked_ptr to avoid -// confusion with normal linked_ptr. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ - -#include -#include - - -namespace testing { -namespace internal { - -// Protects copying of all linked_ptr objects. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// This is used internally by all instances of linked_ptr<>. It needs to be -// a non-template class because different types of linked_ptr<> can refer to -// the same object (linked_ptr(obj) vs linked_ptr(obj)). -// So, it needs to be possible for different types of linked_ptr to participate -// in the same circular linked list, so we need a single class type here. -// -// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. -class linked_ptr_internal { - public: - // Create a new circle that includes only this instance. - void join_new() { - next_ = this; - } - - // Many linked_ptr operations may change p.link_ for some linked_ptr - // variable p in the same circle as this object. Therefore we need - // to prevent two such operations from occurring concurrently. - // - // Note that different types of linked_ptr objects can coexist in a - // circle (e.g. linked_ptr, linked_ptr, and - // linked_ptr). Therefore we must use a single mutex to - // protect all linked_ptr objects. This can create serious - // contention in production code, but is acceptable in a testing - // framework. - - // Join an existing circle. - void join(linked_ptr_internal const* ptr) - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - linked_ptr_internal const* p = ptr; - while (p->next_ != ptr) p = p->next_; - p->next_ = this; - next_ = ptr; - } - - // Leave whatever circle we're part of. Returns true if we were the - // last member of the circle. Once this is done, you can join() another. - bool depart() - GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) { - MutexLock lock(&g_linked_ptr_mutex); - - if (next_ == this) return true; - linked_ptr_internal const* p = next_; - while (p->next_ != this) p = p->next_; - p->next_ = next_; - return false; - } - - private: - mutable linked_ptr_internal const* next_; -}; - -template -class linked_ptr { - public: - typedef T element_type; - - // Take over ownership of a raw pointer. This should happen as soon as - // possible after the object is created. - explicit linked_ptr(T* ptr = NULL) { capture(ptr); } - ~linked_ptr() { depart(); } - - // Copy an existing linked_ptr<>, adding ourselves to the list of references. - template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } - linked_ptr(linked_ptr const& ptr) { // NOLINT - assert(&ptr != this); - copy(&ptr); - } - - // Assignment releases the old value and acquires the new. - template linked_ptr& operator=(linked_ptr const& ptr) { - depart(); - copy(&ptr); - return *this; - } - - linked_ptr& operator=(linked_ptr const& ptr) { - if (&ptr != this) { - depart(); - copy(&ptr); - } - return *this; - } - - // Smart pointer members. - void reset(T* ptr = NULL) { - depart(); - capture(ptr); - } - T* get() const { return value_; } - T* operator->() const { return value_; } - T& operator*() const { return *value_; } - - bool operator==(T* p) const { return value_ == p; } - bool operator!=(T* p) const { return value_ != p; } - template - bool operator==(linked_ptr const& ptr) const { - return value_ == ptr.get(); - } - template - bool operator!=(linked_ptr const& ptr) const { - return value_ != ptr.get(); - } - - private: - template - friend class linked_ptr; - - T* value_; - linked_ptr_internal link_; - - void depart() { - if (link_.depart()) delete value_; - } - - void capture(T* ptr) { - value_ = ptr; - link_.join_new(); - } - - template void copy(linked_ptr const* ptr) { - value_ = ptr->get(); - if (value_) - link_.join(&ptr->link_); - else - link_.join_new(); - } -}; - -template inline -bool operator==(T* ptr, const linked_ptr& x) { - return ptr == x.get(); -} - -template inline -bool operator!=(T* ptr, const linked_ptr& x) { - return ptr != x.get(); -} - -// A function to convert T* into linked_ptr -// Doing e.g. make_linked_ptr(new FooBarBaz(arg)) is a shorter notation -// for linked_ptr >(new FooBarBaz(arg)) -template -linked_ptr make_linked_ptr(T* ptr) { - return linked_ptr(ptr); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// A user can teach this function how to print a class type T by -// defining either operator<<() or PrintTo() in the namespace that -// defines T. More specifically, the FIRST defined function in the -// following list will be used (assuming T is defined in namespace -// foo): -// -// 1. foo::PrintTo(const T&, ostream*) -// 2. operator<<(ostream&, const T&) defined in either foo or the -// global namespace. -// -// If none of the above is defined, it will print the debug string of -// the value if it is a protocol buffer, or print the raw bytes in the -// value otherwise. -// -// To aid debugging: when T is a reference type, the address of the -// value is also printed; when T is a (const) char pointer, both the -// pointer value and the NUL-terminated string it points to are -// printed. -// -// We also provide some convenient wrappers: -// -// // Prints a value to a string. For a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// std::string ::testing::PrintToString(const T& value); -// -// // Prints a value tersely: for a reference type, the referenced -// // value (but not the address) is printed; for a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); -// -// // Prints value using the type inferred by the compiler. The difference -// // from UniversalTersePrint() is that this function prints both the -// // pointer and the NUL-terminated string for a (const or not) char pointer. -// void ::testing::internal::UniversalPrint(const T& value, ostream*); -// -// // Prints the fields of a tuple tersely to a string vector, one -// // element for each field. Tuple support must be enabled in -// // gtest-port.h. -// std::vector UniversalTersePrintTupleFieldsToStrings( -// const Tuple& value); -// -// Known limitation: -// -// The print primitives print the elements of an STL-style container -// using the compiler-inferred type of *iter where iter is a -// const_iterator of the container. When const_iterator is an input -// iterator but not a forward iterator, this inferred type may not -// match value_type, and the print output may be incorrect. In -// practice, this is rarely a problem as for most containers -// const_iterator is a forward iterator. We'll fix this if there's an -// actual need for it. Note that this fix cannot rely on value_type -// being defined as many user-defined container types don't have -// value_type. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#include // NOLINT -#include -#include -#include -#include - -namespace testing { - -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { - -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); - -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) - kOtherType // anything else -}; - -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. - static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), - sizeof(value), os); - } -}; - -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; - -template -class TypeWithoutFormatter { - public: - static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? - short_str : ("\n" + value.DebugString()); - *os << ("<" + pretty_str + ">"); - } -}; - -template -class TypeWithoutFormatter { - public: - // Since T has no << operator or PrintTo() but can be implicitly - // converted to BiggestInt, we print it as a BiggestInt. - // - // Most likely T is an enum type (either named or unnamed), in which - // case printing it as an integer is the desired behavior. In case - // T is not an enum, printing it as an integer is the best we can do - // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; - } -}; - -// Prints the given value to the given ostream. If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); - return os; -} - -} // namespace internal2 -} // namespace testing - -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { - -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - // - // We cannot write 'using ::testing::internal2::operator<<;', which - // gcc 3.3 fails to compile due to a compiler bug. - using namespace ::testing::internal2; // NOLINT - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). - *os << value; -} - -} // namespace testing_internal - -namespace testing { -namespace internal { - -// UniversalPrinter::Print(value, ostream_ptr) prints the given -// value to the given ostream. The caller must ensure that -// 'ostream_ptr' is not NULL, or the behavior is undefined. -// -// We define UniversalPrinter as a class template (as opposed to a -// function template), as we need to partially specialize it for -// reference types, which cannot be done with function templates. -template -class UniversalPrinter; - -template -void UniversalPrint(const T& value, ::std::ostream* os); - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. -template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, - T* p, ::std::ostream* os) { - if (p == NULL) { - *os << "NULL"; - } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - -// Prints the given value using the << operator if it has one; -// otherwise prints the bytes in it. This is what -// UniversalPrinter::Print() does when PrintTo() is not specialized -// or overloaded for type T. -// -// A user can override this behavior for a class type Foo by defining -// an overload of PrintTo() in the namespace where Foo is defined. We -// give the user this option as sometimes defining a << operator for -// Foo is not desirable (e.g. the coding style may prevent doing it, -// or there is already a << operator but it doesn't do what the user -// wants). -template -void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); -} - -// The following list of PrintTo() overloads tells -// UniversalPrinter::Print() how to print standard types (built-in -// types, strings, plain arrays, and pointers). - -// Overloads for various char types. -GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); -GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); -inline void PrintTo(char c, ::std::ostream* os) { - // When printing a plain char, we always treat it as unsigned. This - // way, the output won't be affected by whether the compiler thinks - // char is signed or not. - PrintTo(static_cast(c), os); -} - -// Overloads for other simple built-in types. -inline void PrintTo(bool x, ::std::ostream* os) { - *os << (x ? "true" : "false"); -} - -// Overload for wchar_t type. -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its decimal code (except for L'\0'). -// The L'\0' char is printed as "L'\\0'". The decimal code is printed -// as signed integer when wchar_t is implemented by the compiler -// as a signed type and is printed as an unsigned integer when wchar_t -// is implemented as an unsigned type. -GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); - -// Overloads for C strings. -GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); -inline void PrintTo(char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// signed/unsigned char is often used for representing binary data, so -// we print pointers to it as void* to be safe. -inline void PrintTo(const signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(const unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// MSVC can be configured to define wchar_t as a typedef of unsigned -// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native -// type. When wchar_t is a typedef, defining an overload for const -// wchar_t* would cause unsigned short* be printed as a wide string, -// possibly causing invalid memory accesses. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Overloads for wide C strings -GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); -inline void PrintTo(wchar_t* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -#endif - -// Overload for C arrays. Multi-dimensional arrays are printed -// properly. - -// Prints the given number of elements in an array, without printing -// the curly braces. -template -void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { - UniversalPrint(a[0], os); - for (size_t i = 1; i != count; i++) { - *os << ", "; - UniversalPrint(a[i], os); - } -} - -// Overloads for ::string and ::std::string. -#if GTEST_HAS_GLOBAL_STRING -GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); -inline void PrintTo(const ::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); -inline void PrintTo(const ::std::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} - -// Overloads for ::wstring and ::std::wstring. -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_TR1_TUPLE -// Overload for ::std::tr1::tuple. Needed for printing function arguments, -// which are packed as tuples. - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os); - -// Overloaded PrintTo() for tuples of various arities. We support -// tuples of up-to 10 fields. The following implementation works -// regardless of whether tr1::tuple is implemented using the -// non-standard variadic template feature or not. - -inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo( - const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} -#endif // GTEST_HAS_TR1_TUPLE - -// Overload for std::pair. -template -void PrintTo(const ::std::pair& value, ::std::ostream* os) { - *os << '('; - // We cannot use UniversalPrint(value.first, os) here, as T1 may be - // a reference type. The same for printing value.second. - UniversalPrinter::Print(value.first, os); - *os << ", "; - UniversalPrinter::Print(value.second, os); - *os << ')'; -} - -// Implements printing a non-reference type T by letting the compiler -// pick the right overload of PrintTo() for T. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - // Note: we deliberately don't call this PrintTo(), as that name - // conflicts with ::testing::internal::PrintTo in the body of the - // function. - static void Print(const T& value, ::std::ostream* os) { - // By default, ::testing::internal::PrintTo() is used for printing - // the value. - // - // Thanks to Koenig look-up, if T is a class and has its own - // PrintTo() function defined in its namespace, that function will - // be visible here. Since it is more specific than the generic ones - // in ::testing::internal, it will be picked by the compiler in the - // following statement - exactly what we want. - PrintTo(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// UniversalPrintArray(begin, len, os) prints an array of 'len' -// elements, starting at address 'begin'. -template -void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { - if (len == 0) { - *os << "{}"; - } else { - *os << "{ "; - const size_t kThreshold = 18; - const size_t kChunkSize = 8; - // If the array has more than kThreshold elements, we'll have to - // omit some details by printing only the first and the last - // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. - if (len <= kThreshold) { - PrintRawArrayTo(begin, len, os); - } else { - PrintRawArrayTo(begin, kChunkSize, os); - *os << ", ..., "; - PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); - } - *os << " }"; - } -} -// This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray( - const char* begin, size_t len, ::std::ostream* os); - -// This overload prints a (const) wchar_t array compactly. -GTEST_API_ void UniversalPrintArray( - const wchar_t* begin, size_t len, ::std::ostream* os); - -// Implements printing an array type T[N]. -template -class UniversalPrinter { - public: - // Prints the given array, omitting some elements when there are too - // many. - static void Print(const T (&a)[N], ::std::ostream* os) { - UniversalPrintArray(a, N, os); - } -}; - -// Implements printing a reference type T&. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - static void Print(const T& value, ::std::ostream* os) { - // Prints the address of the value. We use reinterpret_cast here - // as static_cast doesn't compile when T is a function type. - *os << "@" << reinterpret_cast(&value) << " "; - - // Then prints the value itself. - UniversalPrint(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// Prints a value tersely: for a reference type, the referenced value -// (but not the address) is printed; for a (const) char pointer, the -// NUL-terminated string (but not the pointer) is printed. - -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); - } -}; -template -class UniversalTersePrinter { - public: - static void Print(const T (&value)[N], ::std::ostream* os) { - UniversalPrinter::Print(value, os); - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(const char* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(string(str), os); - } - } -}; -template <> -class UniversalTersePrinter { - public: - static void Print(char* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -#if GTEST_HAS_STD_WSTRING -template <> -class UniversalTersePrinter { - public: - static void Print(const wchar_t* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(::std::wstring(str), os); - } - } -}; -#endif - -template <> -class UniversalTersePrinter { - public: - static void Print(wchar_t* str, ::std::ostream* os) { - UniversalTersePrinter::Print(str, os); - } -}; - -template -void UniversalTersePrint(const T& value, ::std::ostream* os) { - UniversalTersePrinter::Print(value, os); -} - -// Prints a value using the type inferred by the compiler. The -// difference between this and UniversalTersePrint() is that for a -// (const) char pointer, this prints both the pointer and the -// NUL-terminated string. -template -void UniversalPrint(const T& value, ::std::ostream* os) { - // A workarond for the bug in VC++ 7.1 that prevents us from instantiating - // UniversalPrinter with T directly. - typedef T T1; - UniversalPrinter::Print(value, os); -} - -#if GTEST_HAS_TR1_TUPLE -typedef ::std::vector Strings; - -// This helper template allows PrintTo() for tuples and -// UniversalTersePrintTupleFieldsToStrings() to be defined by -// induction on the number of tuple fields. The idea is that -// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N -// fields in tuple t, and can be defined in terms of -// TuplePrefixPrinter. - -// The inductive case. -template -struct TuplePrefixPrinter { - // Prints the first N fields of a tuple. - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - TuplePrefixPrinter::PrintPrefixTo(t, os); - *os << ", "; - UniversalPrinter::type> - ::Print(::std::tr1::get(t), os); - } - - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Base cases. -template <> -struct TuplePrefixPrinter<0> { - template - static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} - - template - static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} -}; -// We have to specialize the entire TuplePrefixPrinter<> class -// template here, even though the definition of -// TersePrintPrefixToStrings() is the same as the generic version, as -// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't -// support specializing a method template of a class template. -template <> -struct TuplePrefixPrinter<1> { - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - UniversalPrinter::type>:: - Print(::std::tr1::get<0>(t), os); - } - - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get<0>(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os) { - *os << "("; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - PrintPrefixTo(t, os); - *os << ")"; -} - -// Prints the fields of a tuple tersely to a string vector, one -// element for each field. See the comment before -// UniversalTersePrint() for how we define "tersely". -template -Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { - Strings result; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - TersePrintPrefixToStrings(value, &result); - return result; -} -#endif // GTEST_HAS_TR1_TUPLE - -} // namespace internal - -template -::std::string PrintToString(const T& value) { - ::std::stringstream ss; - internal::UniversalTersePrinter::Print(value, &ss); - return ss.str(); -} - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { -namespace internal { - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Outputs a message explaining invalid registration of different -// fixture class for the same test case. This may happen when -// TEST_P macro is used to define two tests with the same name -// but in different namespaces. -GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line); - -template class ParamGeneratorInterface; -template class ParamGenerator; - -// Interface for iterating over elements provided by an implementation -// of ParamGeneratorInterface. -template -class ParamIteratorInterface { - public: - virtual ~ParamIteratorInterface() {} - // A pointer to the base generator instance. - // Used only for the purposes of iterator comparison - // to make sure that two iterators belong to the same generator. - virtual const ParamGeneratorInterface* BaseGenerator() const = 0; - // Advances iterator to point to the next element - // provided by the generator. The caller is responsible - // for not calling Advance() on an iterator equal to - // BaseGenerator()->End(). - virtual void Advance() = 0; - // Clones the iterator object. Used for implementing copy semantics - // of ParamIterator. - virtual ParamIteratorInterface* Clone() const = 0; - // Dereferences the current iterator and provides (read-only) access - // to the pointed value. It is the caller's responsibility not to call - // Current() on an iterator equal to BaseGenerator()->End(). - // Used for implementing ParamGenerator::operator*(). - virtual const T* Current() const = 0; - // Determines whether the given iterator and other point to the same - // element in the sequence generated by the generator. - // Used for implementing ParamGenerator::operator==(). - virtual bool Equals(const ParamIteratorInterface& other) const = 0; -}; - -// Class iterating over elements provided by an implementation of -// ParamGeneratorInterface. It wraps ParamIteratorInterface -// and implements the const forward iterator concept. -template -class ParamIterator { - public: - typedef T value_type; - typedef const T& reference; - typedef ptrdiff_t difference_type; - - // ParamIterator assumes ownership of the impl_ pointer. - ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} - ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); - return *this; - } - - const T& operator*() const { return *impl_->Current(); } - const T* operator->() const { return impl_->Current(); } - // Prefix version of operator++. - ParamIterator& operator++() { - impl_->Advance(); - return *this; - } - // Postfix version of operator++. - ParamIterator operator++(int /*unused*/) { - ParamIteratorInterface* clone = impl_->Clone(); - impl_->Advance(); - return ParamIterator(clone); - } - bool operator==(const ParamIterator& other) const { - return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); - } - bool operator!=(const ParamIterator& other) const { - return !(*this == other); - } - - private: - friend class ParamGenerator; - explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - scoped_ptr > impl_; -}; - -// ParamGeneratorInterface is the binary interface to access generators -// defined in other translation units. -template -class ParamGeneratorInterface { - public: - typedef T ParamType; - - virtual ~ParamGeneratorInterface() {} - - // Generator interface definition - virtual ParamIteratorInterface* Begin() const = 0; - virtual ParamIteratorInterface* End() const = 0; -}; - -// Wraps ParamGeneratorInterface and provides general generator syntax -// compatible with the STL Container concept. -// This class implements copy initialization semantics and the contained -// ParamGeneratorInterface instance is shared among all copies -// of the original object. This is possible because that instance is immutable. -template -class ParamGenerator { - public: - typedef ParamIterator iterator; - - explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} - ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} - - ParamGenerator& operator=(const ParamGenerator& other) { - impl_ = other.impl_; - return *this; - } - - iterator begin() const { return iterator(impl_->Begin()); } - iterator end() const { return iterator(impl_->End()); } - - private: - linked_ptr > impl_; -}; - -// Generates values from a range of two comparable values. Can be used to -// generate sequences of user-defined types that implement operator+() and -// operator<(). -// This class is used in the Range() function. -template -class RangeGenerator : public ParamGeneratorInterface { - public: - RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} - virtual ~RangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, begin_, 0, step_); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, end_, end_index_, step_); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, T value, int index, - IncrementT step) - : base_(base), value_(value), index_(index), step_(step) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - value_ = value_ + step_; - index_++; - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const T* Current() const { return &value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const int other_index = - CheckedDowncastToActualType(&other)->index_; - return index_ == other_index; - } - - private: - Iterator(const Iterator& other) - : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), - step_(other.step_) {} - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - T value_; - int index_; - const IncrementT step_; - }; // class RangeGenerator::Iterator - - static int CalculateEndIndex(const T& begin, - const T& end, - const IncrementT& step) { - int end_index = 0; - for (T i = begin; i < end; i = i + step) - end_index++; - return end_index; - } - - // No implementation - assignment is unsupported. - void operator=(const RangeGenerator& other); - - const T begin_; - const T end_; - const IncrementT step_; - // The index for the end() iterator. All the elements in the generated - // sequence are indexed (0-based) to aid iterator comparison. - const int end_index_; -}; // class RangeGenerator - - -// Generates values from a pair of STL-style iterators. Used in the -// ValuesIn() function. The elements are copied from the source range -// since the source can be located on the stack, and the generator -// is likely to persist beyond that stack frame. -template -class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { - public: - template - ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) - : container_(begin, end) {} - virtual ~ValuesInIteratorRangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, container_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, container_.end()); - } - - private: - typedef typename ::std::vector ContainerType; - - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - typename ContainerType::const_iterator iterator) - : base_(base), iterator_(iterator) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - ++iterator_; - value_.reset(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - // We need to use cached value referenced by iterator_ because *iterator_ - // can return a temporary object (and of type other then T), so just - // having "return &*iterator_;" doesn't work. - // value_ is updated here and not in Advance() because Advance() - // can advance iterator_ beyond the end of the range, and we cannot - // detect that fact. The client code, on the other hand, is - // responsible for not calling Current() on an out-of-range iterator. - virtual const T* Current() const { - if (value_.get() == NULL) - value_.reset(new T(*iterator_)); - return value_.get(); - } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; - } - - private: - Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. - : ParamIteratorInterface(), - base_(other.base_), - iterator_(other.iterator_) {} - - const ParamGeneratorInterface* const base_; - typename ContainerType::const_iterator iterator_; - // A cached value of *iterator_. We keep it here to allow access by - // pointer in the wrapping iterator's operator->(). - // value_ needs to be mutable to be accessed in Current(). - // Use of scoped_ptr helps manage cached value's lifetime, - // which is bound by the lifespan of the iterator itself. - mutable scoped_ptr value_; - }; // class ValuesInIteratorRangeGenerator::Iterator - - // No implementation - assignment is unsupported. - void operator=(const ValuesInIteratorRangeGenerator& other); - - const ContainerType container_; -}; // class ValuesInIteratorRangeGenerator - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Stores a parameter value and later creates tests parameterized with that -// value. -template -class ParameterizedTestFactory : public TestFactoryBase { - public: - typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} - virtual Test* CreateTest() { - TestClass::SetParam(¶meter_); - return new TestClass(); - } - - private: - const ParamType parameter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactoryBase is a base class for meta-factories that create -// test factories for passing into MakeAndRegisterTestInfo function. -template -class TestMetaFactoryBase { - public: - virtual ~TestMetaFactoryBase() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactory creates test factories for passing into -// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives -// ownership of test factory pointer, same factory object cannot be passed -// into that method twice. But ParameterizedTestCaseInfo is going to call -// it for each Test/Parameter value combination. Thus it needs meta factory -// creator class. -template -class TestMetaFactory - : public TestMetaFactoryBase { - public: - typedef typename TestCase::ParamType ParamType; - - TestMetaFactory() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { - return new ParameterizedTestFactory(parameter); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfoBase is a generic interface -// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase -// accumulates test information provided by TEST_P macro invocations -// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations -// and uses that information to register all resulting test instances -// in RegisterTests method. The ParameterizeTestCaseRegistry class holds -// a collection of pointers to the ParameterizedTestCaseInfo objects -// and calls RegisterTests() on each of them when asked. -class ParameterizedTestCaseInfoBase { - public: - virtual ~ParameterizedTestCaseInfoBase() {} - - // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const = 0; - // UnitTest class invokes this method to register tests in this - // test case right before running them in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - virtual void RegisterTests() = 0; - - protected: - ParameterizedTestCaseInfoBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P -// macro invocations for a particular test case and generators -// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that -// test case. It registers tests with all values generated by all -// generators when asked. -template -class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { - public: - // ParamType and GeneratorCreationFunc are private types but are required - // for declarations of public methods AddTestPattern() and - // AddTestCaseInstantiation(). - typedef typename TestCase::ParamType ParamType; - // A function that returns an instance of appropriate generator type. - typedef ParamGenerator(GeneratorCreationFunc)(); - - explicit ParameterizedTestCaseInfo(const char* name) - : test_case_name_(name) {} - - // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } - // TEST_P macro uses AddTestPattern() to record information - // about a single test in a LocalTestInfo structure. - // test_case_name is the base name of the test case (without invocation - // prefix). test_base_name is the name of an individual test without - // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is - // test case base name and DoBar is test base name. - void AddTestPattern(const char* test_case_name, - const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(linked_ptr(new TestInfo(test_case_name, - test_base_name, - meta_factory))); - } - // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information - // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, - GeneratorCreationFunc* func, - const char* /* file */, - int /* line */) { - instantiations_.push_back(::std::make_pair(instantiation_name, func)); - return 0; // Return value used only to run this method in namespace scope. - } - // UnitTest class invokes this method to register tests in this test case - // test cases right before running tests in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - // UnitTest has a guard to prevent from calling this method more then once. - virtual void RegisterTests() { - for (typename TestInfoContainer::iterator test_it = tests_.begin(); - test_it != tests_.end(); ++test_it) { - linked_ptr test_info = *test_it; - for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { - const string& instantiation_name = gen_it->first; - ParamGenerator generator((*gen_it->second)()); - - string test_case_name; - if ( !instantiation_name.empty() ) - test_case_name = instantiation_name + "/"; - test_case_name += test_info->test_case_base_name; - - int i = 0; - for (typename ParamGenerator::iterator param_it = - generator.begin(); - param_it != generator.end(); ++param_it, ++i) { - Message test_name_stream; - test_name_stream << test_info->test_base_name << "/" << i; - MakeAndRegisterTestInfo( - test_case_name.c_str(), - test_name_stream.GetString().c_str(), - NULL, // No type parameter. - PrintToString(*param_it).c_str(), - GetTestCaseTypeId(), - TestCase::SetUpTestCase, - TestCase::TearDownTestCase, - test_info->test_meta_factory->CreateTestFactory(*param_it)); - } // for param_it - } // for gen_it - } // for test_it - } // RegisterTests - - private: - // LocalTestInfo structure keeps information about a single test registered - // with TEST_P macro. - struct TestInfo { - TestInfo(const char* a_test_case_base_name, - const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) : - test_case_base_name(a_test_case_base_name), - test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} - - const string test_case_base_name; - const string test_base_name; - const scoped_ptr > test_meta_factory; - }; - typedef ::std::vector > TestInfoContainer; - // Keeps pairs of - // received from INSTANTIATE_TEST_CASE_P macros. - typedef ::std::vector > - InstantiationContainer; - - const string test_case_name_; - TestInfoContainer tests_; - InstantiationContainer instantiations_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); -}; // class ParameterizedTestCaseInfo - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase -// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P -// macros use it to locate their corresponding ParameterizedTestCaseInfo -// descriptors. -class ParameterizedTestCaseRegistry { - public: - ParameterizedTestCaseRegistry() {} - ~ParameterizedTestCaseRegistry() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - delete *it; - } - } - - // Looks up or creates and returns a structure containing information about - // tests and instantiations of a particular test case. - template - ParameterizedTestCaseInfo* GetTestCasePatternHolder( - const char* test_case_name, - const char* file, - int line) { - ParameterizedTestCaseInfo* typed_test_info = NULL; - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - if ((*it)->GetTestCaseName() == test_case_name) { - if ((*it)->GetTestCaseTypeId() != GetTypeId()) { - // Complain about incorrect usage of Google Test facilities - // and terminate the program since we cannot guaranty correct - // test case setup and tear-down in this case. - ReportInvalidTestCaseType(test_case_name, file, line); - posix::Abort(); - } else { - // At this point we are sure that the object we found is of the same - // type we are looking for, so we downcast it to that type - // without further checks. - typed_test_info = CheckedDowncastToActualType< - ParameterizedTestCaseInfo >(*it); - } - break; - } - } - if (typed_test_info == NULL) { - typed_test_info = new ParameterizedTestCaseInfo(test_case_name); - test_case_infos_.push_back(typed_test_info); - } - return typed_test_info; - } - void RegisterTests() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - (*it)->RegisterTests(); - } - } - - private: - typedef ::std::vector TestCaseInfoContainer; - - TestCaseInfoContainer test_case_infos_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); -}; - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -// This file was GENERATED by command: -// pump.py gtest-param-util-generated.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most 50 arguments in Values, -// and at most 10 arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tr1::tuple which is -// currently set at 10. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -template -class ValueArray1 { - public: - explicit ValueArray1(T1 v1) : v1_(v1) {} - - template - operator ParamGenerator() const { return ValuesIn(&v1_, &v1_ + 1); } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray1& other); - - const T1 v1_; -}; - -template -class ValueArray2 { - public: - ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray2& other); - - const T1 v1_; - const T2 v2_; -}; - -template -class ValueArray3 { - public: - ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray3& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; -}; - -template -class ValueArray4 { - public: - ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray4& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; -}; - -template -class ValueArray5 { - public: - ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray5& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; -}; - -template -class ValueArray6 { - public: - ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray6& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; -}; - -template -class ValueArray7 { - public: - ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray7& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; -}; - -template -class ValueArray8 { - public: - ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray8& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; -}; - -template -class ValueArray9 { - public: - ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray9& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; -}; - -template -class ValueArray10 { - public: - ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray10& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; -}; - -template -class ValueArray11 { - public: - ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray11& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; -}; - -template -class ValueArray12 { - public: - ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray12& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; -}; - -template -class ValueArray13 { - public: - ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray13& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; -}; - -template -class ValueArray14 { - public: - ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray14& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; -}; - -template -class ValueArray15 { - public: - ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray15& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; -}; - -template -class ValueArray16 { - public: - ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray16& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; -}; - -template -class ValueArray17 { - public: - ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray17& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; -}; - -template -class ValueArray18 { - public: - ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray18& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; -}; - -template -class ValueArray19 { - public: - ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray19& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; -}; - -template -class ValueArray20 { - public: - ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray20& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; -}; - -template -class ValueArray21 { - public: - ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray21& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; -}; - -template -class ValueArray22 { - public: - ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray22& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; -}; - -template -class ValueArray23 { - public: - ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray23& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; -}; - -template -class ValueArray24 { - public: - ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray24& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; -}; - -template -class ValueArray25 { - public: - ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray25& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; -}; - -template -class ValueArray26 { - public: - ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray26& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; -}; - -template -class ValueArray27 { - public: - ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray27& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; -}; - -template -class ValueArray28 { - public: - ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray28& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; -}; - -template -class ValueArray29 { - public: - ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray29& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; -}; - -template -class ValueArray30 { - public: - ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray30& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; -}; - -template -class ValueArray31 { - public: - ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray31& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; -}; - -template -class ValueArray32 { - public: - ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray32& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; -}; - -template -class ValueArray33 { - public: - ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray33& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; -}; - -template -class ValueArray34 { - public: - ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray34& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; -}; - -template -class ValueArray35 { - public: - ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray35& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; -}; - -template -class ValueArray36 { - public: - ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray36& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; -}; - -template -class ValueArray37 { - public: - ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray37& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; -}; - -template -class ValueArray38 { - public: - ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray38& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; -}; - -template -class ValueArray39 { - public: - ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray39& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; -}; - -template -class ValueArray40 { - public: - ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray40& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; -}; - -template -class ValueArray41 { - public: - ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray41& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; -}; - -template -class ValueArray42 { - public: - ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray42& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; -}; - -template -class ValueArray43 { - public: - ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), - v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray43& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; -}; - -template -class ValueArray44 { - public: - ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), - v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), - v43_(v43), v44_(v44) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray44& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; -}; - -template -class ValueArray45 { - public: - ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), - v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray45& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; -}; - -template -class ValueArray46 { - public: - ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray46& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; -}; - -template -class ValueArray47 { - public: - ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), - v47_(v47) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray47& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; -}; - -template -class ValueArray48 { - public: - ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), - v46_(v46), v47_(v47), v48_(v48) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray48& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; -}; - -template -class ValueArray49 { - public: - ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, - T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray49& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; -}; - -template -class ValueArray50 { - public: - ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, - T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} - - template - operator ParamGenerator() const { - const T array[] = {static_cast(v1_), static_cast(v2_), - static_cast(v3_), static_cast(v4_), static_cast(v5_), - static_cast(v6_), static_cast(v7_), static_cast(v8_), - static_cast(v9_), static_cast(v10_), static_cast(v11_), - static_cast(v12_), static_cast(v13_), static_cast(v14_), - static_cast(v15_), static_cast(v16_), static_cast(v17_), - static_cast(v18_), static_cast(v19_), static_cast(v20_), - static_cast(v21_), static_cast(v22_), static_cast(v23_), - static_cast(v24_), static_cast(v25_), static_cast(v26_), - static_cast(v27_), static_cast(v28_), static_cast(v29_), - static_cast(v30_), static_cast(v31_), static_cast(v32_), - static_cast(v33_), static_cast(v34_), static_cast(v35_), - static_cast(v36_), static_cast(v37_), static_cast(v38_), - static_cast(v39_), static_cast(v40_), static_cast(v41_), - static_cast(v42_), static_cast(v43_), static_cast(v44_), - static_cast(v45_), static_cast(v46_), static_cast(v47_), - static_cast(v48_), static_cast(v49_), static_cast(v50_)}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray50& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; - const T50 v50_; -}; - -# if GTEST_HAS_COMBINE -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Generates values from the Cartesian product of values produced -// by the argument generators. -// -template -class CartesianProductGenerator2 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator2(const ParamGenerator& g1, - const ParamGenerator& g2) - : g1_(g1), g2_(g2) {} - virtual ~CartesianProductGenerator2() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current2_; - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - ParamType current_value_; - }; // class CartesianProductGenerator2::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator2& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; -}; // class CartesianProductGenerator2 - - -template -class CartesianProductGenerator3 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator3(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - virtual ~CartesianProductGenerator3() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current3_; - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - ParamType current_value_; - }; // class CartesianProductGenerator3::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator3& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; -}; // class CartesianProductGenerator3 - - -template -class CartesianProductGenerator4 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator4(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - virtual ~CartesianProductGenerator4() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current4_; - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - ParamType current_value_; - }; // class CartesianProductGenerator4::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator4& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; -}; // class CartesianProductGenerator4 - - -template -class CartesianProductGenerator5 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator5(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - virtual ~CartesianProductGenerator5() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current5_; - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - ParamType current_value_; - }; // class CartesianProductGenerator5::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator5& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; -}; // class CartesianProductGenerator5 - - -template -class CartesianProductGenerator6 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator6(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - virtual ~CartesianProductGenerator6() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current6_; - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - ParamType current_value_; - }; // class CartesianProductGenerator6::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator6& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; -}; // class CartesianProductGenerator6 - - -template -class CartesianProductGenerator7 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator7(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - virtual ~CartesianProductGenerator7() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current7_; - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - ParamType current_value_; - }; // class CartesianProductGenerator7::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator7& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; -}; // class CartesianProductGenerator7 - - -template -class CartesianProductGenerator8 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator8(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - virtual ~CartesianProductGenerator8() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current8_; - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - ParamType current_value_; - }; // class CartesianProductGenerator8::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator8& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; -}; // class CartesianProductGenerator8 - - -template -class CartesianProductGenerator9 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator9(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - virtual ~CartesianProductGenerator9() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current9_; - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - ParamType current_value_; - }; // class CartesianProductGenerator9::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator9& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; -}; // class CartesianProductGenerator9 - - -template -class CartesianProductGenerator10 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator10(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9, - const ParamGenerator& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - virtual ~CartesianProductGenerator10() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end(), g10_, g10_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9, - const ParamGenerator& g10, - const typename ParamGenerator::iterator& current10) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9), - begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current10_; - if (current10_ == end10_) { - current10_ = begin10_; - ++current9_; - } - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_ && - current10_ == typed_other->current10_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_), - begin10_(other.begin10_), - end10_(other.end10_), - current10_(other.current10_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_ || - current10_ == end10_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - const typename ParamGenerator::iterator begin10_; - const typename ParamGenerator::iterator end10_; - typename ParamGenerator::iterator current10_; - ParamType current_value_; - }; // class CartesianProductGenerator10::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator10& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; - const ParamGenerator g10_; -}; // class CartesianProductGenerator10 - - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -template -class CartesianProductHolder2 { - public: -CartesianProductHolder2(const Generator1& g1, const Generator2& g2) - : g1_(g1), g2_(g2) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator2( - static_cast >(g1_), - static_cast >(g2_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder2& other); - - const Generator1 g1_; - const Generator2 g2_; -}; // class CartesianProductHolder2 - -template -class CartesianProductHolder3 { - public: -CartesianProductHolder3(const Generator1& g1, const Generator2& g2, - const Generator3& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator3( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder3& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; -}; // class CartesianProductHolder3 - -template -class CartesianProductHolder4 { - public: -CartesianProductHolder4(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator4( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder4& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; -}; // class CartesianProductHolder4 - -template -class CartesianProductHolder5 { - public: -CartesianProductHolder5(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator5( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder5& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; -}; // class CartesianProductHolder5 - -template -class CartesianProductHolder6 { - public: -CartesianProductHolder6(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator6( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder6& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; -}; // class CartesianProductHolder6 - -template -class CartesianProductHolder7 { - public: -CartesianProductHolder7(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator7( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder7& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; -}; // class CartesianProductHolder7 - -template -class CartesianProductHolder8 { - public: -CartesianProductHolder8(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator8( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder8& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; -}; // class CartesianProductHolder8 - -template -class CartesianProductHolder9 { - public: -CartesianProductHolder9(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator9( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder9& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; -}; // class CartesianProductHolder9 - -template -class CartesianProductHolder10 { - public: -CartesianProductHolder10(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9, const Generator10& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator10( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_), - static_cast >(g10_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder10& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; - const Generator10 g10_; -}; // class CartesianProductHolder10 - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Functions producing parameter generators. -// -// Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated -// with a particular generator, Google Test creates and runs tests -// for each element in the sequence produced by the generator. -// -// In the following sample, tests from test case FooTest are instantiated -// each three times with parameter values 3, 5, and 8: -// -// class FooTest : public TestWithParam { ... }; -// -// TEST_P(FooTest, TestThis) { -// } -// TEST_P(FooTest, TestThat) { -// } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); -// - -// Range() returns generators providing sequences of values in a range. -// -// Synopsis: -// Range(start, end) -// - returns a generator producing a sequence of values {start, start+1, -// start+2, ..., }. -// Range(start, end, step) -// - returns a generator producing a sequence of values {start, start+step, -// start+step+step, ..., }. -// Notes: -// * The generated sequences never include end. For example, Range(1, 5) -// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) -// returns a generator producing {1, 3, 5, 7}. -// * start and end must have the same type. That type may be any integral or -// floating-point type or a user defined type satisfying these conditions: -// * It must be assignable (have operator=() defined). -// * It must have operator+() (operator+(int-compatible type) for -// two-operand version). -// * It must have operator<() defined. -// Elements in the resulting sequences will also have that type. -// * Condition start < end must be satisfied in order for resulting sequences -// to contain any elements. -// -template -internal::ParamGenerator Range(T start, T end, IncrementT step) { - return internal::ParamGenerator( - new internal::RangeGenerator(start, end, step)); -} - -template -internal::ParamGenerator Range(T start, T end) { - return Range(start, end, 1); -} - -// ValuesIn() function allows generation of tests with parameters coming from -// a container. -// -// Synopsis: -// ValuesIn(const T (&array)[N]) -// - returns a generator producing sequences with elements from -// a C-style array. -// ValuesIn(const Container& container) -// - returns a generator producing sequences with elements from -// an STL-style container. -// ValuesIn(Iterator begin, Iterator end) -// - returns a generator producing sequences with elements from -// a range [begin, end) defined by a pair of STL-style iterators. These -// iterators can also be plain C pointers. -// -// Please note that ValuesIn copies the values from the containers -// passed in and keeps them to generate tests in RUN_ALL_TESTS(). -// -// Examples: -// -// This instantiates tests from test case StringTest -// each with C-string values of "foo", "bar", and "baz": -// -// const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); -// -// This instantiates tests from test case StlStringTest -// each with STL strings with values "a" and "b": -// -// ::std::vector< ::std::string> GetParameterStrings() { -// ::std::vector< ::std::string> v; -// v.push_back("a"); -// v.push_back("b"); -// return v; -// } -// -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); -// -// -// This will also instantiate tests from CharTest -// each with parameter values 'a' and 'b': -// -// ::std::list GetParameterChars() { -// ::std::list list; -// list.push_back('a'); -// list.push_back('b'); -// return list; -// } -// ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); -// -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; - return internal::ParamGenerator( - new internal::ValuesInIteratorRangeGenerator(begin, end)); -} - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]) { - return ValuesIn(array, array + N); -} - -template -internal::ParamGenerator ValuesIn( - const Container& container) { - return ValuesIn(container.begin(), container.end()); -} - -// Values() allows generating tests from explicitly specified list of -// parameters. -// -// Synopsis: -// Values(T v1, T v2, ..., T vN) -// - returns a generator producing sequences with elements v1, v2, ..., vN. -// -// For example, this instantiates tests from test case BarTest each -// with values "one", "two", and "three": -// -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); -// -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. -// The exact type of values will depend on the type of parameter in BazTest. -// -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); -// -// Currently, Values() supports from 1 to 50 parameters. -// -template -internal::ValueArray1 Values(T1 v1) { - return internal::ValueArray1(v1); -} - -template -internal::ValueArray2 Values(T1 v1, T2 v2) { - return internal::ValueArray2(v1, v2); -} - -template -internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { - return internal::ValueArray3(v1, v2, v3); -} - -template -internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { - return internal::ValueArray4(v1, v2, v3, v4); -} - -template -internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5) { - return internal::ValueArray5(v1, v2, v3, v4, v5); -} - -template -internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6) { - return internal::ValueArray6(v1, v2, v3, v4, v5, v6); -} - -template -internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7) { - return internal::ValueArray7(v1, v2, v3, v4, v5, - v6, v7); -} - -template -internal::ValueArray8 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { - return internal::ValueArray8(v1, v2, v3, v4, - v5, v6, v7, v8); -} - -template -internal::ValueArray9 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { - return internal::ValueArray9(v1, v2, v3, - v4, v5, v6, v7, v8, v9); -} - -template -internal::ValueArray10 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { - return internal::ValueArray10(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10); -} - -template -internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) { - return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); -} - -template -internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) { - return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); -} - -template -internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) { - return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); -} - -template -internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { - return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14); -} - -template -internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { - return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); -} - -template -internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16) { - return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16); -} - -template -internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17) { - return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17); -} - -template -internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18) { - return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18); -} - -template -internal::ValueArray19 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { - return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); -} - -template -internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { - return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); -} - -template -internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { - return internal::ValueArray21(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); -} - -template -internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22) { - return internal::ValueArray22(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22); -} - -template -internal::ValueArray23 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23) { - return internal::ValueArray23(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23); -} - -template -internal::ValueArray24 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24) { - return internal::ValueArray24(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24); -} - -template -internal::ValueArray25 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { - return internal::ValueArray25(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25); -} - -template -internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) { - return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); -} - -template -internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) { - return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); -} - -template -internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) { - return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28); -} - -template -internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) { - return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29); -} - -template -internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { - return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30); -} - -template -internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { - return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31); -} - -template -internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32) { - return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32); -} - -template -internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33) { - return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); -} - -template -internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34) { - return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); -} - -template -internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { - return internal::ValueArray35(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); -} - -template -internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { - return internal::ValueArray36(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36); -} - -template -internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37) { - return internal::ValueArray37(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37); -} - -template -internal::ValueArray38 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38) { - return internal::ValueArray38(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, - v33, v34, v35, v36, v37, v38); -} - -template -internal::ValueArray39 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38, T39 v39) { - return internal::ValueArray39(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39); -} - -template -internal::ValueArray40 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, - T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, - T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { - return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, - v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); -} - -template -internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { - return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, - v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); -} - -template -internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) { - return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, - v42); -} - -template -internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) { - return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, - v41, v42, v43); -} - -template -internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) { - return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, - v40, v41, v42, v43, v44); -} - -template -internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { - return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, - v39, v40, v41, v42, v43, v44, v45); -} - -template -internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { - return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46); -} - -template -internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { - return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); -} - -template -internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, - T48 v48) { - return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, - v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); -} - -template -internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, - T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, - T47 v47, T48 v48, T49 v49) { - return internal::ValueArray49(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, - v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); -} - -template -internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, - T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, - T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { - return internal::ValueArray50(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50); -} - -// Bool() allows generating tests with parameters in a set of (false, true). -// -// Synopsis: -// Bool() -// - returns a generator producing sequences with elements {false, true}. -// -// It is useful when testing code that depends on Boolean flags. Combinations -// of multiple flags can be tested when several Bool()'s are combined using -// Combine() function. -// -// In the following example all tests in the test case FlagDependentTest -// will be instantiated twice with parameters false and true. -// -// class FlagDependentTest : public testing::TestWithParam { -// virtual void SetUp() { -// external_flag = GetParam(); -// } -// } -// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); -// -inline internal::ParamGenerator Bool() { - return Values(false, true); -} - -# if GTEST_HAS_COMBINE -// Combine() allows the user to combine two or more sequences to produce -// values of a Cartesian product of those sequences' elements. -// -// Synopsis: -// Combine(gen1, gen2, ..., genN) -// - returns a generator producing sequences with elements coming from -// the Cartesian product of elements from the sequences generated by -// gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types -// of elements from sequences produces by gen1, gen2, ..., genN. -// -// Combine can have up to 10 arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. -// -// Example: -// -// This will instantiate tests in test case AnimalTest each one with -// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), -// tuple("dog", BLACK), and tuple("dog", WHITE): -// -// enum Color { BLACK, GRAY, WHITE }; -// class AnimalTest -// : public testing::TestWithParam > {...}; -// -// TEST_P(AnimalTest, AnimalLooksNice) {...} -// -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); -// -// This will instantiate tests in FlagDependentTest with all variations of two -// Boolean flags: -// -// class FlagDependentTest -// : public testing::TestWithParam > { -// virtual void SetUp() { -// // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); -// } -// }; -// -// TEST_P(FlagDependentTest, TestFeature1) { -// // Test your code using external_flag_1 and external_flag_2 here. -// } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -template -internal::CartesianProductHolder2 Combine( - const Generator1& g1, const Generator2& g2) { - return internal::CartesianProductHolder2( - g1, g2); -} - -template -internal::CartesianProductHolder3 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3) { - return internal::CartesianProductHolder3( - g1, g2, g3); -} - -template -internal::CartesianProductHolder4 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4) { - return internal::CartesianProductHolder4( - g1, g2, g3, g4); -} - -template -internal::CartesianProductHolder5 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5) { - return internal::CartesianProductHolder5( - g1, g2, g3, g4, g5); -} - -template -internal::CartesianProductHolder6 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6) { - return internal::CartesianProductHolder6( - g1, g2, g3, g4, g5, g6); -} - -template -internal::CartesianProductHolder7 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7) { - return internal::CartesianProductHolder7( - g1, g2, g3, g4, g5, g6, g7); -} - -template -internal::CartesianProductHolder8 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8) { - return internal::CartesianProductHolder8( - g1, g2, g3, g4, g5, g6, g7, g8); -} - -template -internal::CartesianProductHolder9 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9) { - return internal::CartesianProductHolder9( - g1, g2, g3, g4, g5, g6, g7, g8, g9); -} - -template -internal::CartesianProductHolder10 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9, - const Generator10& g10) { - return internal::CartesianProductHolder10( - g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); -} -# endif // GTEST_HAS_COMBINE - - - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, __FILE__, __LINE__)->AddTestPattern(\ - #test_case_name, \ - #test_name, \ - new ::testing::internal::TestMetaFactory< \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \ - ::testing::internal::ParamGenerator \ - gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\ - #prefix, \ - >est_##prefix##test_case_name##_EvalGenerator_, \ - __FILE__, __LINE__) - -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -// Copyright 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Google C++ Testing Framework definitions useful in production code. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ - -// When you need to test the private or protected members of a class, -// use the FRIEND_TEST macro to declare your tests as friends of the -// class. For example: -// -// class MyClass { -// private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); -// }; -// -// class MyClassTest : public testing::Test { -// // ... -// }; -// -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. -// } - -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test - -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ - -#include -#include - -namespace testing { - -// A copyable object representing the result of a test part (i.e. an -// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). -// -// Don't inherit from TestPartResult as its destructor is not virtual. -class GTEST_API_ TestPartResult { - public: - // The possible outcomes of a test part (i.e. an assertion or an - // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). - enum Type { - kSuccess, // Succeeded. - kNonFatalFailure, // Failed but the test can continue. - kFatalFailure // Failed and the test should be terminated. - }; - - // C'tor. TestPartResult does NOT have a default constructor. - // Always use this constructor (with parameters) to create a - // TestPartResult object. - TestPartResult(Type a_type, - const char* a_file_name, - int a_line_number, - const char* a_message) - : type_(a_type), - file_name_(a_file_name == NULL ? "" : a_file_name), - line_number_(a_line_number), - summary_(ExtractSummary(a_message)), - message_(a_message) { - } - - // Gets the outcome of the test part. - Type type() const { return type_; } - - // Gets the name of the source file where the test part took place, or - // NULL if it's unknown. - const char* file_name() const { - return file_name_.empty() ? NULL : file_name_.c_str(); - } - - // Gets the line in the source file where the test part took place, - // or -1 if it's unknown. - int line_number() const { return line_number_; } - - // Gets the summary of the failure message. - const char* summary() const { return summary_.c_str(); } - - // Gets the message associated with the test part. - const char* message() const { return message_.c_str(); } - - // Returns true iff the test part passed. - bool passed() const { return type_ == kSuccess; } - - // Returns true iff the test part failed. - bool failed() const { return type_ != kSuccess; } - - // Returns true iff the test part non-fatally failed. - bool nonfatally_failed() const { return type_ == kNonFatalFailure; } - - // Returns true iff the test part fatally failed. - bool fatally_failed() const { return type_ == kFatalFailure; } - - private: - Type type_; - - // Gets the summary of the failure message by omitting the stack - // trace in it. - static std::string ExtractSummary(const char* message); - - // The name of the source file where the test part took place, or - // "" if the source file is unknown. - std::string file_name_; - // The line in the source file where the test part took place, or -1 - // if the line number is unknown. - int line_number_; - std::string summary_; // The test failure summary. - std::string message_; // The test failure message. -}; - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result); - -// An array of TestPartResult objects. -// -// Don't inherit from TestPartResultArray as its destructor is not -// virtual. -class GTEST_API_ TestPartResultArray { - public: - TestPartResultArray() {} - - // Appends the given TestPartResult to the array. - void Append(const TestPartResult& result); - - // Returns the TestPartResult at the given index (0-based). - const TestPartResult& GetTestPartResult(int index) const; - - // Returns the number of TestPartResult objects in the array. - int size() const; - - private: - std::vector array_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); -}; - -// This interface knows how to report a test part result. -class TestPartResultReporterInterface { - public: - virtual ~TestPartResultReporterInterface() {} - - virtual void ReportTestPartResult(const TestPartResult& result) = 0; -}; - -namespace internal { - -// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a -// statement generates new fatal failures. To do so it registers itself as the -// current test part result reporter. Besides checking if fatal failures were -// reported, it only delegates the reporting to the former result reporter. -// The original result reporter is restored in the destructor. -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -class GTEST_API_ HasNewFatalFailureHelper - : public TestPartResultReporterInterface { - public: - HasNewFatalFailureHelper(); - virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); - bool has_new_fatal_failure() const { return has_new_fatal_failure_; } - private: - bool has_new_fatal_failure_; - TestPartResultReporterInterface* original_reporter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); -}; - -} // namespace internal - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// This header implements typed tests and type-parameterized tests. - -// Typed (aka type-driven) tests repeat the same test for types in a -// list. You must know which types you want to test with when writing -// typed tests. Here's how you do it: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - public: - ... - typedef std::list List; - static T shared_; - T value_; -}; - -// Next, associate a list of types with the test case, which will be -// repeated for each type in the list. The typedef is necessary for -// the macro to parse correctly. -typedef testing::Types MyTypes; -TYPED_TEST_CASE(FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// TYPED_TEST_CASE(FooTest, int); - -// Then, use TYPED_TEST() instead of TEST_F() to define as many typed -// tests for this test case as you want. -TYPED_TEST(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - // Since we are inside a derived class template, C++ requires use to - // visit the members of FooTest via 'this'. - TypeParam n = this->value_; - - // To visit static members of the fixture, add the TestFixture:: - // prefix. - n += TestFixture::shared_; - - // To refer to typedefs in the fixture, add the "typename - // TestFixture::" prefix. - typename TestFixture::List values; - values.push_back(n); - ... -} - -TYPED_TEST(FooTest, HasPropertyA) { ... } - -#endif // 0 - -// Type-parameterized tests are abstract test patterns parameterized -// by a type. Compared with typed tests, type-parameterized tests -// allow you to define the test pattern without knowing what the type -// parameters are. The defined pattern can be instantiated with -// different types any number of times, in any number of translation -// units. -// -// If you are designing an interface or concept, you can define a -// suite of type-parameterized tests to verify properties that any -// valid implementation of the interface/concept should have. Then, -// each implementation can easily instantiate the test suite to verify -// that it conforms to the requirements, without having to write -// similar tests repeatedly. Here's an example: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - ... -}; - -// Next, declare that you will define a type-parameterized test case -// (the _P suffix is for "parameterized" or "pattern", whichever you -// prefer): -TYPED_TEST_CASE_P(FooTest); - -// Then, use TYPED_TEST_P() to define as many type-parameterized tests -// for this type-parameterized test case as you want. -TYPED_TEST_P(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - TypeParam n = 0; - ... -} - -TYPED_TEST_P(FooTest, HasPropertyA) { ... } - -// Now the tricky part: you need to register all test patterns before -// you can instantiate them. The first argument of the macro is the -// test case name; the rest are the names of the tests in this test -// case. -REGISTER_TYPED_TEST_CASE_P(FooTest, - DoesBlah, HasPropertyA); - -// Finally, you are free to instantiate the pattern with the types you -// want. If you put the above code in a header file, you can #include -// it in multiple C++ source files and instantiate it multiple times. -// -// To distinguish different instances of the pattern, the first -// argument to the INSTANTIATE_* macro is a prefix that will be added -// to the actual test case name. Remember to pick unique prefixes for -// different instances. -typedef testing::Types MyTypes; -INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); - -#endif // 0 - - -// Implements typed tests. - -#if GTEST_HAS_TYPED_TEST - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the typedef for the type parameters of the -// given test case. -# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) - -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() - -#endif // GTEST_HAS_TYPED_TEST - -// Implements type-parameterized tests. - -#if GTEST_HAS_TYPED_TEST_P - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the namespace name that the type-parameterized tests for -// the given type-parameterized test case are defined in. The exact -// name of the namespace is subject to change without notice. -# define GTEST_CASE_NAMESPACE_(TestCaseName) \ - gtest_case_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the variable used to remember the names of -// the defined tests in the given test case. -# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ - gtest_typed_test_case_p_state_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. -// -// Expands to the name of the variable used to remember the names of -// the registered tests in the given test case. -# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ - gtest_registered_test_names_##TestCaseName##_ - -// The variables defined in the type-parameterized test macros are -// static as typically these macros are used in a .h file that can be -// #included in multiple translation units linked together. -# define TYPED_TEST_CASE_P(CaseName) \ - static ::testing::internal::TypedTestCasePState \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) - -# define TYPED_TEST_P(CaseName, TestName) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - template \ - class TestName : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ - __FILE__, __LINE__, #CaseName, #TestName); \ - } \ - template \ - void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() - -# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ - } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ - __FILE__, __LINE__, #__VA_ARGS__) - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) - -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// Depending on the platform, different string classes are available. -// On Linux, in addition to ::std::string, Google also makes use of -// class ::string, which has the same interface as ::std::string, but -// has a different implementation. -// -// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that -// ::string is available AND is a distinct type to ::std::string, or -// define it to 0 to indicate otherwise. -// -// If the user's ::std::string and ::string are the same class due to -// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0. -// -// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined -// heuristically. - -namespace testing { +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-matchers.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" +#include "gtest/gtest_pred_impl.h" +#include "gtest/gtest_prod.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) // Declares the flags. @@ -17512,10 +89,18 @@ GTEST_DECLARE_bool_(catch_exceptions); // to let Google Test decide. GTEST_DECLARE_string_(color); +// This flag controls whether the test runner should continue execution past +// first failure. +GTEST_DECLARE_bool_(fail_fast); + // This flag sets up the filter to select by name using a glob pattern // the tests to run. If the filter is not given all tests are executed. GTEST_DECLARE_string_(filter); +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + // This flag causes the Google Test to list tests. None of the tests listed // are actually run if the flag is provided. GTEST_DECLARE_bool_(list_tests); @@ -17524,10 +109,16 @@ GTEST_DECLARE_bool_(list_tests); // in addition to its normal textual output. GTEST_DECLARE_string_(output); +// This flags control whether Google Test prints only test failures. +GTEST_DECLARE_bool_(brief); + // This flags control whether Google Test prints the elapsed time for each // test. GTEST_DECLARE_bool_(print_time); +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + // This flag specifies the random number seed. GTEST_DECLARE_int32_(random_seed); @@ -17535,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed); // is 1. If the value is -1 the tests are repeating forever. GTEST_DECLARE_int32_(repeat); +// This flag controls whether Google Test Environments are recreated for each +// repeat of the tests. The default value is true. If set to false the global +// test Environment objects are only set up once, for the first iteration, and +// only torn down once, for the last. +GTEST_DECLARE_bool_(recreate_environments_when_repeating); + // This flag controls whether Google Test includes Google Test internal // stack frames in failure stack traces. GTEST_DECLARE_bool_(show_internal_stack_frames); @@ -17548,7 +145,7 @@ GTEST_DECLARE_int32_(stack_trace_depth); // When this flag is specified, a failed assertion will throw an // exception if exceptions are enabled, or exit the program with a -// non-zero code otherwise. +// non-zero code otherwise. For use with an external test framework. GTEST_DECLARE_bool_(throw_on_failure); // When this flag is set with a "host:port" string, on supported @@ -17556,6 +153,20 @@ GTEST_DECLARE_bool_(throw_on_failure); // the specified host machine. GTEST_DECLARE_string_(stream_result_to); +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -17573,9 +184,11 @@ class TestEventListenersAccessor; class TestEventRepeater; class UnitTestRecordPropertyTestHelper; class WindowsDeathTest; +class FuchsiaDeathTest; class UnitTestImpl* GetUnitTestImpl(); void ReportFailureInUnknownLocation(TestPartResult::Type result_type, const std::string& message); +std::set* GetIgnoredParameterizedTestSuites(); } // namespace internal @@ -17583,173 +196,31 @@ void ReportFailureInUnknownLocation(TestPartResult::Type result_type, // If we don't forward declare them the compiler might confuse the classes // in friendship clauses with same named classes on the scope. class Test; -class TestCase; +class TestSuite; + +// Old API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TestCase = TestSuite; +#endif class TestInfo; class UnitTest; -// A class for indicating whether an assertion was successful. When -// the assertion wasn't successful, the AssertionResult object -// remembers a non-empty message that describes how it failed. -// -// To create an instance of this class, use one of the factory functions -// (AssertionSuccess() and AssertionFailure()). -// -// This class is useful for two purposes: -// 1. Defining predicate functions to be used with Boolean test assertions -// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts -// 2. Defining predicate-format functions to be -// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). -// -// For example, if you define IsEven predicate: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) -// will print the message -// -// Value of: IsEven(Fib(5)) -// Actual: false (5 is odd) -// Expected: true -// -// instead of a more opaque -// -// Value of: IsEven(Fib(5)) -// Actual: false -// Expected: true -// -// in case IsEven is a simple Boolean predicate. -// -// If you expect your predicate to be reused and want to support informative -// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up -// about half as often as positive ones in our tests), supply messages for -// both success and failure cases: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess() << n << " is even"; -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print -// -// Value of: IsEven(Fib(6)) -// Actual: true (8 is even) -// Expected: false -// -// NB: Predicates that support negative Boolean assertions have reduced -// performance in positive ones so be careful not to use them in tests -// that have lots (tens of thousands) of positive Boolean assertions. -// -// To use this class with EXPECT_PRED_FORMAT assertions such as: -// -// // Verifies that Foo() returns an even number. -// EXPECT_PRED_FORMAT1(IsEven, Foo()); -// -// you need to define: -// -// testing::AssertionResult IsEven(const char* expr, int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() -// << "Expected: " << expr << " is even\n Actual: it's " << n; -// } -// -// If Foo() returns 5, you will see the following message: -// -// Expected: Foo() is even -// Actual: it's 5 -// -class GTEST_API_ AssertionResult { - public: - // Copy constructor. - // Used in EXPECT_TRUE/FALSE(assertion_result). - AssertionResult(const AssertionResult& other); - // Used in the EXPECT_TRUE/FALSE(bool_expression). - explicit AssertionResult(bool success) : success_(success) {} - - // Returns true iff the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_.get() != NULL ? message_->c_str() : ""; - } - // TODO(vladl@google.com): Remove this after making sure no clients use it. - // Deprecated; please use message() instead. - const char* failure_message() const { return message(); } - - // Streams a custom failure message into this object. - template AssertionResult& operator<<(const T& value) { - AppendMessage(Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. - AssertionResult& operator<<( - ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { - AppendMessage(Message() << basic_manipulator); - return *this; - } - - private: - // Appends the contents of message to message_. - void AppendMessage(const Message& a_message) { - if (message_.get() == NULL) - message_.reset(new ::std::string); - message_->append(a_message.GetString().c_str()); - } - - // Stores result of the assertion predicate. - bool success_; - // Stores the message describing the condition in case the expectation - // construct is not satisfied with the predicate's outcome. - // Referenced via a pointer to avoid taking too much stack frame space - // with test assertions. - internal::scoped_ptr< ::std::string> message_; - - GTEST_DISALLOW_ASSIGN_(AssertionResult); -}; - -// Makes a successful assertion result. -GTEST_API_ AssertionResult AssertionSuccess(); - -// Makes a failed assertion result. -GTEST_API_ AssertionResult AssertionFailure(); - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << msg. -GTEST_API_ AssertionResult AssertionFailure(const Message& msg); - // The abstract class that all tests inherit from. // -// In Google Test, a unit test program contains one or many TestCases, and -// each TestCase contains one or many Tests. +// In Google Test, a unit test program contains one or many TestSuites, and +// each TestSuite contains one or many Tests. // // When you define a test using the TEST macro, you don't need to // explicitly derive from Test - the TEST macro automatically does // this for you. // // The only time you derive from Test is when defining a test fixture -// to be used a TEST_F. For example: +// to be used in a TEST_F. For example: // // class FooTest : public testing::Test { // protected: -// virtual void SetUp() { ... } -// virtual void TearDown() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } // ... // }; // @@ -17761,49 +232,54 @@ class GTEST_API_ Test { public: friend class TestInfo; - // Defines types for pointers to functions that set up and tear down - // a test case. - typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc; - typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc; - // The d'tor is virtual as we intend to inherit from Test. virtual ~Test(); - // Sets up the stuff shared by all tests in this test case. + // Sets up the stuff shared by all tests in this test suite. // - // Google Test will call Foo::SetUpTestCase() before running the first - // test in test case Foo. Hence a sub-class can define its own - // SetUpTestCase() method to shadow the one defined in the super + // Google Test will call Foo::SetUpTestSuite() before running the first + // test in test suite Foo. Hence a sub-class can define its own + // SetUpTestSuite() method to shadow the one defined in the super // class. - static void SetUpTestCase() {} + static void SetUpTestSuite() {} - // Tears down the stuff shared by all tests in this test case. + // Tears down the stuff shared by all tests in this test suite. // - // Google Test will call Foo::TearDownTestCase() after running the last - // test in test case Foo. Hence a sub-class can define its own - // TearDownTestCase() method to shadow the one defined in the super + // Google Test will call Foo::TearDownTestSuite() after running the last + // test in test suite Foo. Hence a sub-class can define its own + // TearDownTestSuite() method to shadow the one defined in the super // class. + static void TearDownTestSuite() {} + + // Legacy API is deprecated but still available. Use SetUpTestSuite and + // TearDownTestSuite instead. +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ static void TearDownTestCase() {} + static void SetUpTestCase() {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ - // Returns true iff the current test has a fatal failure. + // Returns true if and only if the current test has a fatal failure. static bool HasFatalFailure(); - // Returns true iff the current test has a non-fatal failure. + // Returns true if and only if the current test has a non-fatal failure. static bool HasNonfatalFailure(); - // Returns true iff the current test has a (either fatal or + // Returns true if and only if the current test was skipped. + static bool IsSkipped(); + + // Returns true if and only if the current test has a (either fatal or // non-fatal) failure. static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } - // Logs a property for the current test, test case, or for the entire + // Logs a property for the current test, test suite, or for the entire // invocation of the test program when used outside of the context of a - // test case. Only the last value for a given key is remembered. These + // test suite. Only the last value for a given key is remembered. These // are public static so they can be called from utility functions that are // not members of the test fixture. Calls to RecordProperty made during // lifespan of the test (from the moment its constructor starts to the // moment its destructor finishes) will be output in XML as attributes of // the element. Properties recorded from fixture's - // SetUpTestCase or TearDownTestCase are logged as attributes of the + // SetUpTestSuite or TearDownTestSuite are logged as attributes of the // corresponding element. Calls to RecordProperty made in the // global context (before or after invocation of RUN_ALL_TESTS and from // SetUp/TearDown method of Environment objects registered with Google @@ -17822,8 +298,8 @@ class GTEST_API_ Test { virtual void TearDown(); private: - // Returns true iff the current test has the same fixture class as - // the first test in the current test case. + // Returns true if and only if the current test has the same fixture class + // as the first test in the current test suite. static bool HasSameFixtureClass(); // Runs the test after the test fixture has been set up. @@ -17841,30 +317,30 @@ class GTEST_API_ Test { // internal method to avoid clashing with names used in user TESTs. void DeleteSelf_() { delete this; } - // Uses a GTestFlagSaver to save and restore all Google Test flags. - const internal::GTestFlagSaver* const gtest_flag_saver_; + const std::unique_ptr gtest_flag_saver_; - // Often a user mis-spells SetUp() as Setup() and spends a long time + // Often a user misspells SetUp() as Setup() and spends a long time // wondering why it is never called by Google Test. The declaration of // the following method is solely for catching such an error at // compile time: // // - The return type is deliberately chosen to be not void, so it - // will be a conflict if a user declares void Setup() in his test - // fixture. + // will be a conflict if void Setup() is declared in the user's + // test fixture. // // - This method is private, so it will be another compiler error - // if a user calls it from his test fixture. + // if the method is called from the user's test fixture. // // DO NOT OVERRIDE THIS FUNCTION. // // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } // We disallow copying Tests. - GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); + Test(const Test&) = delete; + Test& operator=(const Test&) = delete; }; typedef internal::TimeInMillis TimeInMillis; @@ -17878,24 +354,17 @@ class TestProperty { // C'tor. TestProperty does NOT have a default constructor. // Always use this constructor (with parameters) to create a // TestProperty object. - TestProperty(const std::string& a_key, const std::string& a_value) : - key_(a_key), value_(a_value) { - } + TestProperty(const std::string& a_key, const std::string& a_value) + : key_(a_key), value_(a_value) {} // Gets the user supplied key. - const char* key() const { - return key_.c_str(); - } + const char* key() const { return key_.c_str(); } // Gets the user supplied value. - const char* value() const { - return value_.c_str(); - } + const char* value() const { return value_.c_str(); } // Sets a new value, overriding the one supplied in the constructor. - void SetValue(const std::string& new_value) { - value_ = new_value; - } + void SetValue(const std::string& new_value) { value_ = new_value; } private: // The key supplied by the user. @@ -17925,24 +394,30 @@ class GTEST_API_ TestResult { // Returns the number of the test properties. int test_property_count() const; - // Returns true iff the test passed (i.e. no test part failed). - bool Passed() const { return !Failed(); } + // Returns true if and only if the test passed (i.e. no test part failed). + bool Passed() const { return !Skipped() && !Failed(); } - // Returns true iff the test failed. + // Returns true if and only if the test was skipped. + bool Skipped() const; + + // Returns true if and only if the test failed. bool Failed() const; - // Returns true iff the test fatally failed. + // Returns true if and only if the test fatally failed. bool HasFatalFailure() const; - // Returns true iff the test has a non-fatal failure. + // Returns true if and only if the test has a non-fatal failure. bool HasNonfatalFailure() const; // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } - // Returns the i-th test part result among all the results. i can range - // from 0 to test_property_count() - 1. If i is not in that range, aborts - // the program. + // Gets the time of the test case start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Returns the i-th test part result among all the results. i can range from 0 + // to total_part_count() - 1. If i is not in that range, aborts the program. const TestPartResult& GetTestPartResult(int i) const; // Returns the i-th test property. i can range from 0 to @@ -17952,13 +427,14 @@ class GTEST_API_ TestResult { private: friend class TestInfo; - friend class TestCase; + friend class TestSuite; friend class UnitTest; friend class internal::DefaultGlobalTestPartResultReporter; friend class internal::ExecDeathTest; friend class internal::TestResultAccessor; friend class internal::UnitTestImpl; friend class internal::WindowsDeathTest; + friend class internal::FuchsiaDeathTest; // Gets the vector of TestPartResults. const std::vector& test_part_results() const { @@ -17970,6 +446,9 @@ class GTEST_API_ TestResult { return test_properties_; } + // Sets the start time. + void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; } + // Sets the elapsed time. void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } @@ -17983,8 +462,8 @@ class GTEST_API_ TestResult { const TestProperty& test_property); // Adds a failure if the key is a reserved attribute of Google Test - // testcase tags. Returns true if the property is valid. - // TODO(russr): Validate attribute names are legal and human readable. + // testsuite tags. Returns true if the property is valid. + // FIXME: Validate attribute names are legal and human readable. static bool ValidateTestProperty(const std::string& xml_element, const TestProperty& test_property); @@ -18005,7 +484,7 @@ class GTEST_API_ TestResult { // Protects mutable state of the property vector and of owned // properties, whose values may be updated. - internal::Mutex test_properites_mutex_; + internal::Mutex test_properties_mutex_; // The vector of TestPartResults std::vector test_part_results_; @@ -18013,16 +492,19 @@ class GTEST_API_ TestResult { std::vector test_properties_; // Running count of death tests. int death_test_count_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; // The elapsed time, in milliseconds. TimeInMillis elapsed_time_; // We disallow copying TestResult. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); + TestResult(const TestResult&) = delete; + TestResult& operator=(const TestResult&) = delete; }; // class TestResult // A TestInfo object stores the following information about a test: // -// Test case name +// Test suite name // Test name // Whether the test should be run // A function pointer that creates the test object when invoked @@ -18037,8 +519,13 @@ class GTEST_API_ TestInfo { // don't inherit from TestInfo. ~TestInfo(); - // Returns the test case name. - const char* test_case_name() const { return test_case_name_.c_str(); } + // Returns the test suite name. + const char* test_suite_name() const { return test_suite_name_.c_str(); } + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const char* test_case_name() const { return test_suite_name(); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Returns the test name. const char* name() const { return name_.c_str(); } @@ -18046,25 +533,32 @@ class GTEST_API_ TestInfo { // Returns the name of the parameter type, or NULL if this is not a typed // or a type-parameterized test. const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; } // Returns the text representation of the value parameter, or NULL if this // is not a value-parameterized test. const char* value_param() const { - if (value_param_.get() != NULL) - return value_param_->c_str(); - return NULL; + if (value_param_.get() != nullptr) return value_param_->c_str(); + return nullptr; } + // Returns the file name where this test is defined. + const char* file() const { return location_.file.c_str(); } + + // Returns the line where this test is defined. + int line() const { return location_.line; } + + // Return true if this test should not be run because it's in another shard. + bool is_in_another_shard() const { return is_in_another_shard_; } + // Returns true if this test should run, that is if the test is not // disabled (or it is disabled but the also_run_disabled_tests flag has // been specified) and its full name matches the user-specified filter. // // Google Test allows the user to filter the tests by their full names. - // The full name of a test Bar in test case Foo is defined as + // The full name of a test Bar in test suite Foo is defined as // "Foo.Bar". Only the tests that match the filter will run. // // A filter is a colon-separated list of glob (not regex) patterns, @@ -18077,12 +571,11 @@ class GTEST_API_ TestInfo { // contains the character 'A' or starts with "Foo.". bool should_run() const { return should_run_; } - // Returns true iff this test will appear in the XML report. + // Returns true if and only if this test will appear in the XML report. bool is_reportable() const { - // For now, the XML report includes all tests matching the filter. - // In the future, we may trim tests that are excluded because of - // sharding. - return matches_filter_; + // The XML report includes tests matching the filter, excluding those + // run in other shards. + return matches_filter_ && !is_in_another_shard_; } // Returns the result of the test. @@ -18093,25 +586,22 @@ class GTEST_API_ TestInfo { friend class internal::DefaultDeathTestFactory; #endif // GTEST_HAS_DEATH_TEST friend class Test; - friend class TestCase; + friend class TestSuite; friend class internal::UnitTestImpl; friend class internal::StreamingListenerTest; friend TestInfo* internal::MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - internal::TypeId fixture_class_id, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, internal::CodeLocation code_location, + internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, internal::TestFactoryBase* factory); // Constructs a TestInfo object. The newly constructed instance assumes // ownership of the factory object. - TestInfo(const std::string& test_case_name, - const std::string& name, + TestInfo(const std::string& test_suite_name, const std::string& name, const char* a_type_param, // NULL if not a type-parameterized test const char* a_value_param, // NULL if not a value-parameterized test + internal::CodeLocation a_code_location, internal::TypeId fixture_class_id, internal::TestFactoryBase* factory); @@ -18125,24 +615,29 @@ class GTEST_API_ TestInfo { // deletes it. void Run(); + // Skip and records the test result for this object. + void Skip(); + static void ClearTestResult(TestInfo* test_info) { test_info->result_.Clear(); } // These fields are immutable properties of the test. - const std::string test_case_name_; // Test case name - const std::string name_; // Test name + const std::string test_suite_name_; // test suite name + const std::string name_; // Test name // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. - const internal::scoped_ptr type_param_; + const std::unique_ptr type_param_; // Text representation of the value parameter, or NULL if this is not a // value-parameterized test. - const internal::scoped_ptr value_param_; - const internal::TypeId fixture_class_id_; // ID of the test fixture class - bool should_run_; // True iff this test should run - bool is_disabled_; // True iff this test is disabled - bool matches_filter_; // True if this test matches the - // user-specified filter. + const std::unique_ptr value_param_; + internal::CodeLocation location_; + const internal::TypeId fixture_class_id_; // ID of the test fixture class + bool should_run_; // True if and only if this test should run + bool is_disabled_; // True if and only if this test is disabled + bool matches_filter_; // True if this test matches the + // user-specified filter. + bool is_in_another_shard_; // Will be run in another shard. internal::TestFactoryBase* const factory_; // The factory that creates // the test object @@ -18150,93 +645,102 @@ class GTEST_API_ TestInfo { // test for the second time. TestResult result_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); + TestInfo(const TestInfo&) = delete; + TestInfo& operator=(const TestInfo&) = delete; }; -// A test case, which consists of a vector of TestInfos. +// A test suite, which consists of a vector of TestInfos. // -// TestCase is not copyable. -class GTEST_API_ TestCase { +// TestSuite is not copyable. +class GTEST_API_ TestSuite { public: - // Creates a TestCase with the given name. + // Creates a TestSuite with the given name. // - // TestCase does NOT have a default constructor. Always use this - // constructor to create a TestCase object. + // TestSuite does NOT have a default constructor. Always use this + // constructor to create a TestSuite object. // // Arguments: // - // name: name of the test case + // name: name of the test suite // a_type_param: the name of the test's type parameter, or NULL if // this is not a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase(const char* name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite(const char* name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); - // Destructor of TestCase. - virtual ~TestCase(); + // Destructor of TestSuite. + virtual ~TestSuite(); - // Gets the name of the TestCase. + // Gets the name of the TestSuite. const char* name() const { return name_.c_str(); } // Returns the name of the parameter type, or NULL if this is not a - // type-parameterized test case. + // type-parameterized test suite. const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; + if (type_param_.get() != nullptr) return type_param_->c_str(); + return nullptr; } - // Returns true if any test in this test case should run. + // Returns true if any test in this test suite should run. bool should_run() const { return should_run_; } - // Gets the number of successful tests in this test case. + // Gets the number of successful tests in this test suite. int successful_test_count() const; - // Gets the number of failed tests in this test case. + // Gets the number of skipped tests in this test suite. + int skipped_test_count() const; + + // Gets the number of failed tests in this test suite. int failed_test_count() const; // Gets the number of disabled tests that will be reported in the XML report. int reportable_disabled_test_count() const; - // Gets the number of disabled tests in this test case. + // Gets the number of disabled tests in this test suite. int disabled_test_count() const; // Gets the number of tests to be printed in the XML report. int reportable_test_count() const; - // Get the number of tests in this test case that should run. + // Get the number of tests in this test suite that should run. int test_to_run_count() const; - // Gets the number of all tests in this test case. + // Gets the number of all tests in this test suite. int total_test_count() const; - // Returns true iff the test case passed. + // Returns true if and only if the test suite passed. bool Passed() const { return !Failed(); } - // Returns true iff the test case failed. - bool Failed() const { return failed_test_count() > 0; } + // Returns true if and only if the test suite failed. + bool Failed() const { + return failed_test_count() > 0 || ad_hoc_test_result().Failed(); + } // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } + // Gets the time of the test suite start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + // Returns the i-th test among all the tests. i can range from 0 to // total_test_count() - 1. If i is not in that range, returns NULL. const TestInfo* GetTestInfo(int i) const; // Returns the TestResult that holds test properties recorded during - // execution of SetUpTestCase and TearDownTestCase. + // execution of SetUpTestSuite and TearDownTestSuite. const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; } private: friend class Test; friend class internal::UnitTestImpl; - // Gets the (mutable) vector of TestInfos in this TestCase. + // Gets the (mutable) vector of TestInfos in this TestSuite. std::vector& test_info_list() { return test_info_list_; } - // Gets the (immutable) vector of TestInfos in this TestCase. + // Gets the (immutable) vector of TestInfos in this TestSuite. const std::vector& test_info_list() const { return test_info_list_; } @@ -18248,51 +752,67 @@ class GTEST_API_ TestCase { // Sets the should_run member. void set_should_run(bool should) { should_run_ = should; } - // Adds a TestInfo to this test case. Will delete the TestInfo upon - // destruction of the TestCase object. - void AddTestInfo(TestInfo * test_info); + // Adds a TestInfo to this test suite. Will delete the TestInfo upon + // destruction of the TestSuite object. + void AddTestInfo(TestInfo* test_info); - // Clears the results of all tests in this test case. + // Clears the results of all tests in this test suite. void ClearResult(); - // Clears the results of all tests in the given test case. - static void ClearTestCaseResult(TestCase* test_case) { - test_case->ClearResult(); + // Clears the results of all tests in the given test suite. + static void ClearTestSuiteResult(TestSuite* test_suite) { + test_suite->ClearResult(); } - // Runs every test in this TestCase. + // Runs every test in this TestSuite. void Run(); - // Runs SetUpTestCase() for this TestCase. This wrapper is needed - // for catching exceptions thrown from SetUpTestCase(). - void RunSetUpTestCase() { (*set_up_tc_)(); } + // Skips the execution of tests under this TestSuite + void Skip(); - // Runs TearDownTestCase() for this TestCase. This wrapper is - // needed for catching exceptions thrown from TearDownTestCase(). - void RunTearDownTestCase() { (*tear_down_tc_)(); } + // Runs SetUpTestSuite() for this TestSuite. This wrapper is needed + // for catching exceptions thrown from SetUpTestSuite(). + void RunSetUpTestSuite() { + if (set_up_tc_ != nullptr) { + (*set_up_tc_)(); + } + } - // Returns true iff test passed. + // Runs TearDownTestSuite() for this TestSuite. This wrapper is + // needed for catching exceptions thrown from TearDownTestSuite(). + void RunTearDownTestSuite() { + if (tear_down_tc_ != nullptr) { + (*tear_down_tc_)(); + } + } + + // Returns true if and only if test passed. static bool TestPassed(const TestInfo* test_info) { return test_info->should_run() && test_info->result()->Passed(); } - // Returns true iff test failed. + // Returns true if and only if test skipped. + static bool TestSkipped(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Skipped(); + } + + // Returns true if and only if test failed. static bool TestFailed(const TestInfo* test_info) { return test_info->should_run() && test_info->result()->Failed(); } - // Returns true iff the test is disabled and will be reported in the XML - // report. + // Returns true if and only if the test is disabled and will be reported in + // the XML report. static bool TestReportableDisabled(const TestInfo* test_info) { return test_info->is_reportable() && test_info->is_disabled_; } - // Returns true iff test is disabled. + // Returns true if and only if test is disabled. static bool TestDisabled(const TestInfo* test_info) { return test_info->is_disabled_; } - // Returns true iff this test will appear in the XML report. + // Returns true if and only if this test will appear in the XML report. static bool TestReportable(const TestInfo* test_info) { return test_info->is_reportable(); } @@ -18302,17 +822,17 @@ class GTEST_API_ TestCase { return test_info->should_run(); } - // Shuffles the tests in this test case. + // Shuffles the tests in this test suite. void ShuffleTests(internal::Random* random); // Restores the test order to before the first shuffle. void UnshuffleTests(); - // Name of the test case. + // Name of the test suite. std::string name_; // Name of the parameter type, or NULL if this is not a typed or a // type-parameterized test. - const internal::scoped_ptr type_param_; + const std::unique_ptr type_param_; // The vector of TestInfos in their original order. It owns the // elements in the vector. std::vector test_info_list_; @@ -18320,24 +840,27 @@ class GTEST_API_ TestCase { // shuffling and restoring the test order. The i-th element in this // vector is the index of the i-th test in the shuffled test list. std::vector test_indices_; - // Pointer to the function that sets up the test case. - Test::SetUpTestCaseFunc set_up_tc_; - // Pointer to the function that tears down the test case. - Test::TearDownTestCaseFunc tear_down_tc_; - // True iff any test in this test case should run. + // Pointer to the function that sets up the test suite. + internal::SetUpTestSuiteFunc set_up_tc_; + // Pointer to the function that tears down the test suite. + internal::TearDownTestSuiteFunc tear_down_tc_; + // True if and only if any test in this test suite should run. bool should_run_; + // The start time, in milliseconds since UNIX Epoch. + TimeInMillis start_timestamp_; // Elapsed time, in milliseconds. TimeInMillis elapsed_time_; - // Holds test properties recorded during execution of SetUpTestCase and - // TearDownTestCase. + // Holds test properties recorded during execution of SetUpTestSuite and + // TearDownTestSuite. TestResult ad_hoc_test_result_; - // We disallow copying TestCases. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase); + // We disallow copying TestSuites. + TestSuite(const TestSuite&) = delete; + TestSuite& operator=(const TestSuite&) = delete; }; // An Environment object is capable of setting up and tearing down an -// environment. The user should subclass this to define his own +// environment. You should subclass this to define your own // environment(s). // // An Environment object does the set-up and tear-down in virtual @@ -18360,13 +883,26 @@ class Environment { // Override this to define how to tear down the environment. virtual void TearDown() {} + private: // If you see an error about overriding the following function or // about it being private, you have mis-spelled SetUp() as Setup(). struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; } }; +#if GTEST_HAS_EXCEPTIONS + +// Exception which can be thrown from TestEventListener::OnTestPartResult. +class GTEST_API_ AssertionException + : public internal::GoogleTestFailureException { + public: + explicit AssertionException(const TestPartResult& result) + : GoogleTestFailureException(result) {} +}; + +#endif // GTEST_HAS_EXCEPTIONS + // The interface for tracing execution of tests. The methods are organized in // the order the corresponding events are fired. class TestEventListener { @@ -18388,20 +924,35 @@ class TestEventListener { // Fired after environment set-up for each iteration of tests ends. virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; - // Fired before the test case starts. - virtual void OnTestCaseStart(const TestCase& test_case) = 0; + // Fired before the test suite starts. + virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Fired before the test starts. virtual void OnTestStart(const TestInfo& test_info) = 0; + // Fired when a test is disabled + virtual void OnTestDisabled(const TestInfo& /*test_info*/) {} + // Fired after a failed assertion or a SUCCEED() invocation. + // If you want to throw an exception from this function to skip to the next + // TEST, it must be AssertionException defined above, or inherited from it. virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; // Fired after the test ends. virtual void OnTestEnd(const TestInfo& test_info) = 0; - // Fired after the test case ends. - virtual void OnTestCaseEnd(const TestCase& test_case) = 0; + // Fired after the test suite ends. + virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Fired before environment tear-down for each iteration of tests starts. virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; @@ -18410,8 +961,7 @@ class TestEventListener { virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; // Fired after each iteration of tests finishes. - virtual void OnTestIterationEnd(const UnitTest& unit_test, - int iteration) = 0; + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0; // Fired after all test activities have ended. virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; @@ -18424,21 +974,31 @@ class TestEventListener { // above. class EmptyTestEventListener : public TestEventListener { public: - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} - virtual void OnTestStart(const TestInfo& /*test_info*/) {} - virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} - virtual void OnTestEnd(const TestInfo& /*test_info*/) {} - virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} - virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} + void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {} + void OnTestEnd(const TestInfo& /*test_info*/) override {} + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} }; // TestEventListeners lets users add listeners to track events in Google Test. @@ -18478,7 +1038,7 @@ class GTEST_API_ TestEventListeners { } private: - friend class TestCase; + friend class TestSuite; friend class TestInfo; friend class internal::DefaultGlobalTestPartResultReporter; friend class internal::NoExecDeathTest; @@ -18516,10 +1076,11 @@ class GTEST_API_ TestEventListeners { TestEventListener* default_xml_generator_; // We disallow copying TestEventListeners. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); + TestEventListeners(const TestEventListeners&) = delete; + TestEventListeners& operator=(const TestEventListeners&) = delete; }; -// A UnitTest consists of a vector of TestCases. +// A UnitTest consists of a vector of TestSuites. // // This is a singleton class. The only instance of UnitTest is // created when UnitTest::GetInstance() is first called. This @@ -18548,44 +1109,56 @@ class GTEST_API_ UnitTest { // was executed. The UnitTest object owns the string. const char* original_working_dir() const; - // Returns the TestCase object for the test that's currently running, + // Returns the TestSuite object for the test that's currently running, // or NULL if no test is running. - const TestCase* current_test_case() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_); + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_); +#endif // Returns the TestInfo object for the test that's currently running, // or NULL if no test is running. - const TestInfo* current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_); + const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_); // Returns the random seed used at the start of the current test run. int random_seed() const; -#if GTEST_HAS_PARAM_TEST - // Returns the ParameterizedTestCaseRegistry object used to keep track of + // Returns the ParameterizedTestSuiteRegistry object used to keep track of // value-parameterized tests and instantiate and register them. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_); -#endif // GTEST_HAS_PARAM_TEST - // Gets the number of successful test cases. - int successful_test_case_count() const; + // Gets the number of successful test suites. + int successful_test_suite_count() const; - // Gets the number of failed test cases. - int failed_test_case_count() const; + // Gets the number of failed test suites. + int failed_test_suite_count() const; - // Gets the number of all test cases. - int total_test_case_count() const; + // Gets the number of all test suites. + int total_test_suite_count() const; - // Gets the number of all test cases that contain at least one test + // Gets the number of all test suites that contain at least one test // that should run. + int test_suite_to_run_count() const; + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + int successful_test_case_count() const; + int failed_test_case_count() const; + int total_test_case_count() const; int test_case_to_run_count() const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Gets the number of successful tests. int successful_test_count() const; + // Gets the number of skipped tests. + int skipped_test_count() const; + // Gets the number of failed tests. int failed_test_count() const; @@ -18611,19 +1184,25 @@ class GTEST_API_ UnitTest { // Gets the elapsed time, in milliseconds. TimeInMillis elapsed_time() const; - // Returns true iff the unit test passed (i.e. all test cases passed). + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). bool Passed() const; - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). bool Failed() const; - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ const TestCase* GetTestCase(int i) const; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ // Returns the TestResult containing information on test failures and - // properties logged outside of individual test cases. + // properties logged outside of individual test suites. const TestResult& ad_hoc_test_result() const; // Returns the list of event listeners that can be used to track events @@ -18647,39 +1226,38 @@ class GTEST_API_ UnitTest { // eventually call this to report their results. The user code // should use the assertion macros instead of calling this directly. void AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, + const char* file_name, int line_number, const std::string& message, const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_); // Adds a TestProperty to the current TestResult object when invoked from - // inside a test, to current TestCase's ad_hoc_test_result_ when invoked - // from SetUpTestCase or TearDownTestCase, or to the global property set + // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked + // from SetUpTestSuite or TearDownTestSuite, or to the global property set // when invoked elsewhere. If the result already contains a property with // the same key, the value will be updated. void RecordProperty(const std::string& key, const std::string& value); - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i); + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableTestSuite(int i); // Accessors for the implementation object. internal::UnitTestImpl* impl() { return impl_; } const internal::UnitTestImpl* impl() const { return impl_; } - // These classes and funcions are friends as they need to access private + // These classes and functions are friends as they need to access private // members of UnitTest. + friend class ScopedTrace; friend class Test; friend class internal::AssertHelper; - friend class internal::ScopedTrace; friend class internal::StreamingListenerTest; friend class internal::UnitTestRecordPropertyTestHelper; friend Environment* AddGlobalTestEnvironment(Environment* env); + friend std::set* internal::GetIgnoredParameterizedTestSuites(); friend internal::UnitTestImpl* internal::GetUnitTestImpl(); friend void internal::ReportFailureInUnknownLocation( - TestPartResult::Type result_type, - const std::string& message); + TestPartResult::Type result_type, const std::string& message); // Creates an empty UnitTest. UnitTest(); @@ -18693,8 +1271,7 @@ class GTEST_API_ UnitTest { GTEST_LOCK_EXCLUDED_(mutex_); // Pops a trace from the per-thread Google Test trace stack. - void PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_); + void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_); // Protects mutable state in *impl_. This is mutable as some const // methods need to lock it too. @@ -18707,7 +1284,8 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl_; // We disallow copying UnitTest. - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); + UnitTest(const UnitTest&) = delete; + UnitTest& operator=(const UnitTest&) = delete; }; // A convenient wrapper for adding an environment for the test @@ -18747,155 +1325,56 @@ GTEST_API_ void InitGoogleTest(int* argc, char** argv); // UNICODE mode. GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +GTEST_API_ void InitGoogleTest(); + namespace internal { -// FormatForComparison::Format(value) formats a -// value of type ToPrint that is an operand of a comparison assertion -// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in -// the comparison, and is used to help determine the best way to -// format the value. In particular, when the value is a C string -// (char pointer) and the other operand is an STL string object, we -// want to format the C string as a string, since we know it is -// compared by value with the string object. If the value is a char -// pointer but the other operand is not an STL string object, we don't -// know whether the pointer is supposed to point to a NUL-terminated -// string, and thus want to print it as a pointer to be safe. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - -// The default case. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint& value) { - return ::testing::PrintToString(value); - } -}; - -// Array. -template -class FormatForComparison { - public: - static ::std::string Format(const ToPrint* value) { - return FormatForComparison::Format(value); - } -}; - -// By default, print C string as pointers to be safe, as we don't know -// whether they actually point to a NUL-terminated string. - -#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ - template \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(static_cast(value)); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); -GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ - -// If a C string is compared with an STL string object, we know it's meant -// to point to a NUL-terminated string, and thus can print it as a string. - -#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ - template <> \ - class FormatForComparison { \ - public: \ - static ::std::string Format(CharType* value) { \ - return ::testing::PrintToString(value); \ - } \ - } - -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); - -#if GTEST_HAS_GLOBAL_STRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string); -#endif - -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring); -#endif - -#if GTEST_HAS_STD_WSTRING -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); -GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); -#endif - -#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ - -// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) -// operand to be used in a failure message. The type (but not value) -// of the other operand may affect the format. This allows us to -// print a char* as a raw pointer when it is compared against another -// char* or void*, and print it as a C string when it is compared -// against an std::string object, for example. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers +// when calling EXPECT_* in a tight loop. template -std::string FormatForComparisonFailureMessage( - const T1& value, const T2& /* other_operand */) { - return FormatForComparison::Format(value); +AssertionResult CmpHelperEQFailure(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return EqFailure(lhs_expression, rhs_expression, + FormatForComparisonFailureMessage(lhs, rhs), + FormatForComparisonFailureMessage(rhs, lhs), false); } +// This block of code defines operator==/!= +// to block lexical scope lookup. +// It prevents using invalid operator==/!= defined at namespace scope. +struct faketype {}; +inline bool operator==(faketype, faketype) { return true; } +inline bool operator!=(faketype, faketype) { return false; } + // The helper function for {ASSERT|EXPECT}_EQ. template -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4389) // Temporarily disables warning on - // signed/unsigned mismatch. -#endif - - if (expected == actual) { +AssertionResult CmpHelperEQ(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + if (lhs == rhs) { return AssertionSuccess(); } -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); + return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } -// With this overloaded version, we allow anonymous enums to be used -// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums -// can be implicitly cast to BiggestInt. -GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual); - -// The helper class for {ASSERT|EXPECT}_EQ. The template argument -// lhs_is_null_literal is true iff the first argument to ASSERT_EQ() -// is a null pointer literal. The following default implementation is -// for lhs_is_null_literal being false. -template class EqHelper { public: // This templatized version is for the general case. - template - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + template < + typename T1, typename T2, + // Disable this overload for cases where one argument is a pointer + // and the other is the null pointer constant. + typename std::enable_if::value || + !std::is_pointer::value>::type* = nullptr> + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, const T1& lhs, + const T2& rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } // With this overloaded version, we allow anonymous enums to be used @@ -18904,149 +1383,109 @@ class EqHelper { // // Even though its body looks the same as the above version, we // cannot merge the two, as it will make anonymous enums unhappy. - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); - } -}; - -// This specialization is used when the first argument to ASSERT_EQ() -// is a null pointer literal, like NULL, false, or 0. -template <> -class EqHelper { - public: - // We define two overloaded versions of Compare(). The first - // version will be picked when the second argument to ASSERT_EQ() is - // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or - // EXPECT_EQ(false, a_bool). - template - static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual, - // The following line prevents this overload from being considered if T2 - // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) - // expands to Compare("", "", NULL, my_ptr), which requires a conversion - // to match the Secret* in the other overload, which would otherwise make - // this template match better. - typename EnableIf::value>::type* = 0) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); + static AssertionResult Compare(const char* lhs_expression, + const char* rhs_expression, BiggestInt lhs, + BiggestInt rhs) { + return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs); } - // This version will be picked when the second argument to ASSERT_EQ() is a - // pointer, e.g. ASSERT_EQ(NULL, a_pointer). template static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - // We used to have a second template parameter instead of Secret*. That - // template parameter would deduce to 'long', making this a better match - // than the first overload even without the first overload's EnableIf. - // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to - // non-pointer argument" (even a deduced integral argument), so the old - // implementation caused warnings in user code. - Secret* /* expected (NULL) */, - T* actual) { - // We already know that 'expected' is a null pointer. - return CmpHelperEQ(expected_expression, actual_expression, - static_cast(NULL), actual); + const char* lhs_expression, const char* rhs_expression, + // Handle cases where '0' is used as a null pointer literal. + std::nullptr_t /* lhs */, T* rhs) { + // We already know that 'lhs' is a null pointer. + return CmpHelperEQ(lhs_expression, rhs_expression, static_cast(nullptr), + rhs); } }; +// Separate the error generating code from the code path to reduce the stack +// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers +// when calling EXPECT_OP in a tight loop. +template +AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2, + const T1& val1, const T2& val2, + const char* op) { + return AssertionFailure() + << "Expected: (" << expr1 << ") " << op << " (" << expr2 + << "), actual: " << FormatForComparisonFailureMessage(val1, val2) + << " vs " << FormatForComparisonFailureMessage(val2, val1); +} + // A macro for implementing the helper functions needed to implement // ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste // of similar code. // -// For each templatized helper function, we also define an overloaded -// version for BiggestInt in order to reduce code bloat and allow -// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled -// with gcc 4. -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -template \ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - const T1& val1, const T2& val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -}\ -GTEST_API_ AssertionResult CmpHelper##op_name(\ - const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) + +#define GTEST_IMPL_CMP_HELPER_(op_name, op) \ + template \ + AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) { \ + if (val1 op val2) { \ + return AssertionSuccess(); \ + } else { \ + return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \ + } \ + } // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. // Implements the helper function for {ASSERT|EXPECT}_NE -GTEST_IMPL_CMP_HELPER_(NE, !=); +GTEST_IMPL_CMP_HELPER_(NE, !=) // Implements the helper function for {ASSERT|EXPECT}_LE -GTEST_IMPL_CMP_HELPER_(LE, <=); +GTEST_IMPL_CMP_HELPER_(LE, <=) // Implements the helper function for {ASSERT|EXPECT}_LT -GTEST_IMPL_CMP_HELPER_(LT, <); +GTEST_IMPL_CMP_HELPER_(LT, <) // Implements the helper function for {ASSERT|EXPECT}_GE -GTEST_IMPL_CMP_HELPER_(GE, >=); +GTEST_IMPL_CMP_HELPER_(GE, >=) // Implements the helper function for {ASSERT|EXPECT}_GT -GTEST_IMPL_CMP_HELPER_(GT, >); +GTEST_IMPL_CMP_HELPER_(GT, >) #undef GTEST_IMPL_CMP_HELPER_ // The helper function for {ASSERT|EXPECT}_STREQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASEEQ. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); +GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression, + const char* s2_expression, + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRNE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); + const char* s1, const char* s2); // The helper function for {ASSERT|EXPECT}_STRCASENE. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, const char* s2_expression, - const char* s1, - const char* s2); - + const char* s1, const char* s2); // Helper function for *_STREQ on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual); +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, const wchar_t* s2); // Helper function for *_STRNE on wide strings. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); + const wchar_t* s1, const wchar_t* s2); } // namespace internal @@ -19058,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, // // The {needle,haystack}_expr arguments are the stringified // expressions that generated the two real arguments. -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const char* needle, + const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const wchar_t* needle, + const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack); #if GTEST_HAS_STD_WSTRING -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack); #endif // GTEST_HAS_STD_WSTRING namespace internal { @@ -19096,28 +1543,25 @@ namespace internal { // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. template -AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, - const char* actual_expression, - RawType expected, - RawType actual) { - const FloatingPoint lhs(expected), rhs(actual); +AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression, + const char* rhs_expression, + RawType lhs_value, RawType rhs_value) { + const FloatingPoint lhs(lhs_value), rhs(rhs_value); if (lhs.AlmostEquals(rhs)) { return AssertionSuccess(); } - ::std::stringstream expected_ss; - expected_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << expected; + ::std::stringstream lhs_ss; + lhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << lhs_value; - ::std::stringstream actual_ss; - actual_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << actual; + ::std::stringstream rhs_ss; + rhs_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << rhs_value; - return EqFailure(expected_expression, - actual_expression, - StringStreamToString(&expected_ss), - StringStreamToString(&actual_ss), + return EqFailure(lhs_expression, rhs_expression, + StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss), false); } @@ -19127,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, const char* abs_error_expr, - double val1, - double val2, + double val1, double val2, double abs_error); // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -19136,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, class GTEST_API_ AssertHelper { public: // Constructor. - AssertHelper(TestPartResult::Type type, - const char* file, - int line, + AssertHelper(TestPartResult::Type type, const char* file, int line, const char* message); ~AssertHelper(); @@ -19152,11 +1593,9 @@ class GTEST_API_ AssertHelper { // re-using stack space even for temporary variables, so every EXPECT_EQ // reserves stack space for another AssertHelper. struct AssertHelperData { - AssertHelperData(TestPartResult::Type t, - const char* srcfile, - int line_num, + AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num, const char* msg) - : type(t), file(srcfile), line(line_num), message(msg) { } + : type(t), file(srcfile), line(line_num), message(msg) {} TestPartResult::Type const type; const char* const file; @@ -19164,17 +1603,18 @@ class GTEST_API_ AssertHelper { std::string const message; private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); + AssertHelperData(const AssertHelperData&) = delete; + AssertHelperData& operator=(const AssertHelperData&) = delete; }; AssertHelperData* const data_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); + AssertHelper(const AssertHelper&) = delete; + AssertHelper& operator=(const AssertHelper&) = delete; }; } // namespace internal -#if GTEST_HAS_PARAM_TEST // The pure interface class that all value-parameterized tests inherit from. // A value-parameterized class must inherit from both ::testing::Test and // ::testing::WithParamInterface. In most cases that just means inheriting @@ -19192,13 +1632,13 @@ class GTEST_API_ AssertHelper { // FooTest() { // // Can use GetParam() here. // } -// virtual ~FooTest() { +// ~FooTest() override { // // Can use GetParam() here. // } -// virtual void SetUp() { +// void SetUp() override { // // Can use GetParam() here. // } -// virtual void TearDown { +// void TearDown override { // // Can use GetParam() here. // } // }; @@ -19207,7 +1647,7 @@ class GTEST_API_ AssertHelper { // Foo foo; // ASSERT_TRUE(foo.DoesBar(GetParam())); // } -// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); +// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); template class WithParamInterface { @@ -19216,12 +1656,9 @@ class WithParamInterface { virtual ~WithParamInterface() {} // The current parameter value. Is also available in the test fixture's - // constructor. This member function is non-static, even though it only - // references static data, to reduce the opportunity for incorrect uses - // like writing 'WithParamInterface::GetParam()' for a test that - // uses a fixture whose parameter type is int. - const ParamType& GetParam() const { - GTEST_CHECK_(parameter_ != NULL) + // constructor. + static const ParamType& GetParam() { + GTEST_CHECK_(parameter_ != nullptr) << "GetParam() can only be called inside a value-parameterized test " << "-- did you intend to write TEST_P instead of TEST_F?"; return *parameter_; @@ -19230,31 +1667,32 @@ class WithParamInterface { private: // Sets parameter value. The caller is responsible for making sure the value // remains alive and unchanged throughout the current test. - static void SetParam(const ParamType* parameter) { - parameter_ = parameter; - } + static void SetParam(const ParamType* parameter) { parameter_ = parameter; } // Static value used for accessing parameter during a test lifetime. static const ParamType* parameter_; // TestClass must be a subclass of WithParamInterface and Test. - template friend class internal::ParameterizedTestFactory; + template + friend class internal::ParameterizedTestFactory; }; template -const T* WithParamInterface::parameter_ = NULL; +const T* WithParamInterface::parameter_ = nullptr; // Most value-parameterized classes can ignore the existence of // WithParamInterface, and can just inherit from ::testing::TestWithParam. template -class TestWithParam : public Test, public WithParamInterface { -}; - -#endif // GTEST_HAS_PARAM_TEST +class TestWithParam : public Test, public WithParamInterface {}; // Macros for indicating success/failure in test code. +// Skips test in runtime. +// Skipping test aborts current function. +// Skipped tests are neither successful nor failed. +#define GTEST_SKIP() GTEST_SKIP_("") + // ADD_FAILURE unconditionally adds a failure to the current test. // SUCCEED generates a success - it doesn't automatically make the // current test successful, as a test is only successful when it has @@ -19277,17 +1715,22 @@ class TestWithParam : public Test, public WithParamInterface { // Generates a nonfatal failure at the given source file location with // a generic message. -#define ADD_FAILURE_AT(file, line) \ +#define ADD_FAILURE_AT(file, line) \ GTEST_MESSAGE_AT_(file, line, "Failed", \ ::testing::TestPartResult::kNonFatalFailure) // Generates a fatal failure with a generic message. #define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") +// Like GTEST_FAIL(), but at the given source file location. +#define GTEST_FAIL_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kFatalFailure) + // Define this macro to 1 to omit the definition of FAIL(), which is a // generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_FAIL -# define FAIL() GTEST_FAIL() +#define FAIL() GTEST_FAIL() #endif // Generates a success with a generic message. @@ -19296,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface { // Define this macro to 1 to omit the definition of SUCCEED(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_SUCCEED -# define SUCCEED() GTEST_SUCCEED() +#define SUCCEED() GTEST_SUCCEED() #endif // Macros for testing exceptions. @@ -19324,388 +1767,45 @@ class TestWithParam : public Test, public WithParamInterface { // Boolean assertions. Condition can be either a Boolean expression or an // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. -#define EXPECT_TRUE(condition) \ +#define GTEST_EXPECT_TRUE(condition) \ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) -#define EXPECT_FALSE(condition) \ +#define GTEST_EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) -#define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_FATAL_FAILURE_) -#define ASSERT_FALSE(condition) \ +#define GTEST_ASSERT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_) +#define GTEST_ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) -// Includes the auto-generated header that implements a family of -// generic predicate assertion macros. -// Copyright 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Define these macros to 1 to omit the definition of the corresponding +// EXPECT or ASSERT, which clashes with some users' own code. -// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! -// -// Implements a family of generic predicate assertion macros. +#if !GTEST_DONT_DEFINE_EXPECT_TRUE +#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition) +#endif -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#if !GTEST_DONT_DEFINE_EXPECT_FALSE +#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition) +#endif -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +#if !GTEST_DONT_DEFINE_ASSERT_TRUE +#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition) +#endif -// This header implements a family of generic predicate assertion -// macros: -// -// ASSERT_PRED_FORMAT1(pred_format, v1) -// ASSERT_PRED_FORMAT2(pred_format, v1, v2) -// ... -// -// where pred_format is a function or functor that takes n (in the -// case of ASSERT_PRED_FORMATn) values and their source expression -// text, and returns a testing::AssertionResult. See the definition -// of ASSERT_EQ in gtest.h for an example. -// -// If you don't care about formatting, you can use the more -// restrictive version: -// -// ASSERT_PRED1(pred, v1) -// ASSERT_PRED2(pred, v1, v2) -// ... -// -// where pred is an n-ary function or functor that returns bool, -// and the values v1, v2, ..., must support the << operator for -// streaming to std::ostream. -// -// We also define the EXPECT_* variations. -// -// For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. - -// GTEST_ASSERT_ is the basic statement to which all of the assertions -// in this file reduce. Don't use this in your code. - -#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ - on_failure(gtest_ar.failure_message()) - - -// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -template -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { - if (pred(v1)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. -// Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) - -// Unary predicate assertion macros. -#define EXPECT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -template -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, - const T2& v2) { - if (pred(v1, v2)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. -// Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) - -// Binary predicate assertion macros. -#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -template -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { - if (pred(v1, v2, v3)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. -// Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) - -// Ternary predicate assertion macros. -#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -template -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { - if (pred(v1, v2, v3, v4)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. -// Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) - -// 4-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -template -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { - if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ", " - << e5 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4 - << "\n" << e5 << " evaluates to " << v5; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. -// Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) - -// 5-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#if !GTEST_DONT_DEFINE_ASSERT_FALSE +#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition) +#endif // Macros for testing equalities and inequalities. // -// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual -// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 -// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 -// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 -// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 -// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 // // When they are not, Google Test prints both the tested expressions and // their actual values. The values must be compatible built-in types, @@ -19727,8 +1827,8 @@ AssertionResult AssertPred5Helper(const char* pred_text, // are related, not how their content is related. To compare two C // strings by content, use {ASSERT|EXPECT}_STR*(). // -// 3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to -// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you +// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to +// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you // what the actual value is when it fails, and similarly for the // other comparisons. // @@ -19739,17 +1839,15 @@ AssertionResult AssertPred5Helper(const char* pred_text, // // Examples: // -// EXPECT_NE(5, Foo()); -// EXPECT_EQ(NULL, a_pointer); +// EXPECT_NE(Foo(), 5); +// EXPECT_EQ(a_pointer, NULL); // ASSERT_LT(i, array_size); // ASSERT_GT(records.size(), 0) << "There is no record left."; -#define EXPECT_EQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) -#define EXPECT_NE(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) +#define EXPECT_EQ(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) +#define EXPECT_NE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define EXPECT_LE(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) #define EXPECT_LT(val1, val2) \ @@ -19759,10 +1857,8 @@ AssertionResult AssertPred5Helper(const char* pred_text, #define EXPECT_GT(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) -#define GTEST_ASSERT_EQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) +#define GTEST_ASSERT_EQ(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2) #define GTEST_ASSERT_NE(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) #define GTEST_ASSERT_LE(val1, val2) \ @@ -19778,27 +1874,27 @@ AssertionResult AssertPred5Helper(const char* pred_text, // ASSERT_XY(), which clashes with some users' own code. #if !GTEST_DONT_DEFINE_ASSERT_EQ -# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_NE -# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LE -# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_LT -# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GE -# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) #endif #if !GTEST_DONT_DEFINE_ASSERT_GT -# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) #endif // C-string Comparisons. All tests treat NULL and any non-NULL string @@ -19817,29 +1913,29 @@ AssertionResult AssertPred5Helper(const char* pred_text, // // These macros evaluate their arguments exactly once. -#define EXPECT_STREQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define EXPECT_STREQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define EXPECT_STRNE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define EXPECT_STRCASEEQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) -#define EXPECT_STRCASENE(s1, s2)\ +#define EXPECT_STRCASEEQ(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) +#define EXPECT_STRCASENE(s1, s2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) -#define ASSERT_STREQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define ASSERT_STREQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2) #define ASSERT_STRNE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define ASSERT_STRCASEEQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) -#define ASSERT_STRCASENE(s1, s2)\ +#define ASSERT_STRCASEEQ(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2) +#define ASSERT_STRCASENE(s1, s2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) // Macros for comparing floating-point numbers. // -// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): +// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2): // Tests that two float values are almost equal. -// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): +// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2): // Tests that two double values are almost equal. // * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): // Tests that v1 and v2 are within the given distance to each other. @@ -19849,29 +1945,29 @@ AssertionResult AssertPred5Helper(const char* pred_text, // FloatingPoint template class in gtest-internal.h if you are // interested in the implementation details. -#define EXPECT_FLOAT_EQ(expected, actual)\ +#define EXPECT_FLOAT_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define EXPECT_DOUBLE_EQ(expected, actual)\ +#define EXPECT_DOUBLE_EQ(val1, val2) \ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_FLOAT_EQ(expected, actual)\ +#define ASSERT_FLOAT_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define ASSERT_DOUBLE_EQ(expected, actual)\ +#define ASSERT_DOUBLE_EQ(val1, val2) \ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) + val1, val2) -#define EXPECT_NEAR(val1, val2, abs_error)\ - EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define EXPECT_NEAR(val1, val2, abs_error) \ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) -#define ASSERT_NEAR(val1, val2, abs_error)\ - ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) +#define ASSERT_NEAR(val1, val2, abs_error) \ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \ + abs_error) // These predicate format functions work on floating-point values, and // can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. @@ -19885,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, double val2); - #if GTEST_OS_WINDOWS // Macros that test for HRESULT failure and success, these are only useful @@ -19897,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // expected result and the actual result with both a human-readable // string representation of the error, if available, as well as the // hex result code. -# define EXPECT_HRESULT_SUCCEEDED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define ASSERT_HRESULT_SUCCEEDED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) +#define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) -# define EXPECT_HRESULT_FAILED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) -# define ASSERT_HRESULT_FAILED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) +#define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) #endif // GTEST_OS_WINDOWS @@ -19922,9 +2017,55 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; // #define ASSERT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) #define EXPECT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + +// Causes a trace (including the given source file path and line number, +// and the given message) to be included in every test failure message generated +// by code in the scope of the lifetime of an instance of this class. The effect +// is undone with the destruction of the instance. +// +// The message argument can be anything streamable to std::ostream. +// +// Example: +// testing::ScopedTrace trace("file.cc", 123, "message"); +// +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + + // Template version. Uses Message() to convert the values into strings. + // Slow, but flexible. + template + ScopedTrace(const char* file, int line, const T& message) { + PushTrace(file, line, (Message() << message).GetString()); + } + + // Optimize for some known types. + ScopedTrace(const char* file, int line, const char* message) { + PushTrace(file, line, message ? message : "(null)"); + } + + ScopedTrace(const char* file, int line, const std::string& message) { + PushTrace(file, line, message); + } + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + void PushTrace(const char* file, int line, std::string message); + + ScopedTrace(const ScopedTrace&) = delete; + ScopedTrace& operator=(const ScopedTrace&) = delete; +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. // Causes a trace (including the source file path, the current line // number, and the given message) to be included in every test failure @@ -19937,13 +2078,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // of the dummy variable name, thus allowing multiple SCOPED_TRACE()s // to appear in the same block - as long as they are on different // lines. -#define SCOPED_TRACE(message) \ - ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, ::testing::Message() << (message)) +// +// Assuming that each thread maintains its own stack of traces. +// Therefore, a SCOPED_TRACE() would (correctly) only affect the +// assertions in its own thread. +#define SCOPED_TRACE(message) \ + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \ + __FILE__, __LINE__, (message)) // Compile-time assertion for type equality. -// StaticAssertTypeEq() compiles iff type1 and type2 are -// the same type. The value it returns is not interesting. +// StaticAssertTypeEq() compiles if and only if type1 and type2 +// are the same type. The value it returns is not interesting. // // Instead of making StaticAssertTypeEq a class template, we make it a // function template that invokes a helper class template. This @@ -19972,21 +2117,21 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // // to cause a compiler error. template -bool StaticAssertTypeEq() { - (void)internal::StaticAssertTypeEqHelper(); +constexpr bool StaticAssertTypeEq() noexcept { + static_assert(std::is_same::value, "T1 and T2 are not the same type"); return true; } // Defines a test. // -// The first parameter is the name of the test case, and the second -// parameter is the name of the test within the test case. +// The first parameter is the name of the test suite, and the second +// parameter is the name of the test within the test suite. // -// The convention is to end the test case name with "Test". For -// example, a test case for the Foo class can be named FooTest. +// The convention is to end the test suite name with "Test". For +// example, a test suite for the Foo class can be named FooTest. // -// The user should put his test code between braces after using this -// macro. Example: +// Test code should appear between braces after an invocation of +// this macro. Example: // // TEST(FooTest, InitializesCorrectly) { // Foo foo; @@ -20002,28 +2147,28 @@ bool StaticAssertTypeEq() { // code. GetTestTypeId() is guaranteed to always return the same // value, as it always calls GetTypeId<>() from the Google Test // framework. -#define GTEST_TEST(test_case_name, test_name)\ - GTEST_TEST_(test_case_name, test_name, \ - ::testing::Test, ::testing::internal::GetTestTypeId()) +#define GTEST_TEST(test_suite_name, test_name) \ + GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \ + ::testing::internal::GetTestTypeId()) // Define this macro to 1 to omit the definition of TEST(), which // is a generic name and clashes with some other libraries. #if !GTEST_DONT_DEFINE_TEST -# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name) +#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name) #endif // Defines a test that uses a test fixture. // // The first parameter is the name of the test fixture class, which -// also doubles as the test case name. The second parameter is the -// name of the test within the test case. +// also doubles as the test suite name. The second parameter is the +// name of the test within the test suite. // // A test fixture class must be declared earlier. The user should put -// his test code between braces after using this macro. Example: +// the test code between braces after using this macro. Example: // // class FooTest : public testing::Test { // protected: -// virtual void SetUp() { b_.AddElement(3); } +// void SetUp() override { b_.AddElement(3); } // // Foo a_; // Foo b_; @@ -20034,13 +2179,104 @@ bool StaticAssertTypeEq() { // } // // TEST_F(FooTest, ReturnsElementCountCorrectly) { -// EXPECT_EQ(0, a_.size()); -// EXPECT_EQ(1, b_.size()); +// EXPECT_EQ(a_.size(), 0); +// EXPECT_EQ(b_.size(), 1); // } - -#define TEST_F(test_fixture, test_name)\ +#define GTEST_TEST_F(test_fixture, test_name) \ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) +#if !GTEST_DONT_DEFINE_TEST_F +#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name) +#endif + +// Returns a path to temporary directory. +// Tries to determine an appropriate directory for the platform. +GTEST_API_ std::string TempDir(); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +// Dynamically registers a test with the framework. +// +// This is an advanced API only to be used when the `TEST` macros are +// insufficient. The macros should be preferred when possible, as they avoid +// most of the complexity of calling this function. +// +// The `factory` argument is a factory callable (move-constructible) object or +// function pointer that creates a new instance of the Test object. It +// handles ownership to the caller. The signature of the callable is +// `Fixture*()`, where `Fixture` is the test fixture class for the test. All +// tests registered with the same `test_suite_name` must return the same +// fixture type. This is checked at runtime. +// +// The framework will infer the fixture class from the factory and will call +// the `SetUpTestSuite` and `TearDownTestSuite` for it. +// +// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is +// undefined. +// +// Use case example: +// +// class MyFixture : public ::testing::Test { +// public: +// // All of these optional, just like in regular macro usage. +// static void SetUpTestSuite() { ... } +// static void TearDownTestSuite() { ... } +// void SetUp() override { ... } +// void TearDown() override { ... } +// }; +// +// class MyTest : public MyFixture { +// public: +// explicit MyTest(int data) : data_(data) {} +// void TestBody() override { ... } +// +// private: +// int data_; +// }; +// +// void RegisterMyTests(const std::vector& values) { +// for (int v : values) { +// ::testing::RegisterTest( +// "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr, +// std::to_string(v).c_str(), +// __FILE__, __LINE__, +// // Important to use the fixture type as the return type here. +// [=]() -> MyFixture* { return new MyTest(v); }); +// } +// } +// ... +// int main(int argc, char** argv) { +// ::testing::InitGoogleTest(&argc, argv); +// std::vector values_to_test = LoadValuesFromConfig(); +// RegisterMyTests(values_to_test); +// ... +// return RUN_ALL_TESTS(); +// } +// +template +TestInfo* RegisterTest(const char* test_suite_name, const char* test_name, + const char* type_param, const char* value_param, + const char* file, int line, Factory factory) { + using TestT = typename std::remove_pointer::type; + + class FactoryImpl : public internal::TestFactoryBase { + public: + explicit FactoryImpl(Factory f) : factory_(std::move(f)) {} + Test* CreateTest() override { return factory_(); } + + private: + Factory factory_; + }; + + return internal::MakeAndRegisterTestInfo( + test_suite_name, test_name, type_param, value_param, + internal::CodeLocation(file, line), internal::GetTypeId(), + internal::SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + internal::SuiteApiResolver::GetTearDownCaseOrSuite(file, line), + new FactoryImpl{std::move(factory)}); +} } // namespace testing @@ -20054,8 +2290,8 @@ bool StaticAssertTypeEq() { // namespace and has an all-caps name. int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; -inline int RUN_ALL_TESTS() { - return ::testing::UnitTest::GetInstance()->Run(); -} +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h new file mode 100644 index 0000000000..47a24aa687 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -0,0 +1,279 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Implements a family of generic predicate assertion macros. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +#include "gtest/gtest-assertion-result.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template +AssertionResult AssertPred1Helper(const char* pred_text, const char* e1, + Pred pred, const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, v1), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +#define GTEST_PRED1_(pred, v1, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template +AssertionResult AssertPred2Helper(const char* pred_text, const char* e1, + const char* e2, Pred pred, const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \ + on_failure) + +// Binary predicate assertion macros. +#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +template +AssertionResult AssertPred3Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, Pred pred, + const T1& v1, const T2& v2, const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \ + GTEST_ASSERT_( \ + ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \ + on_failure) + +// Ternary predicate assertion macros. +#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template +AssertionResult AssertPred4Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, Pred pred, const T1& v1, + const T2& v2, const T3& v3, const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \ + v1, v2, v3, v4), \ + on_failure) + +// 4-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +template +AssertionResult AssertPred5Helper(const char* pred_text, const char* e1, + const char* e2, const char* e3, + const char* e4, const char* e5, Pred pred, + const T1& v1, const T2& v2, const T3& v3, + const T4& v4, const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() + << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4 + << ", " << e5 << ") evaluates to false, where" + << "\n" + << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n" + << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n" + << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n" + << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n" + << e5 << " evaluates to " << ::testing::PrintToString(v5); +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \ + pred, v1, v2, v3, v4, v5), \ + on_failure) + +// 5-ary predicate assertion macros. +#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h new file mode 100644 index 0000000000..1f37dc31c3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -0,0 +1,60 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google C++ Testing and Mocking Framework definitions useful in production +// code. + +#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. +// } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. + +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test + +#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md new file mode 100644 index 0000000000..cb49e2c754 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -0,0 +1,44 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. + +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 0000000000..9b7fb4261a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,68 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +// Use a stub Notification class. +// +// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and +// std::condition_variable. The and headers of +// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only +// when configured with the posix threads option but don't define them when +// configured with the win32 threads option. The Notification class is only +// used in GoogleTest's internal tests. Since we don't build GoogleTest's +// internal tests, we don't need a working Notification class. Although it's +// not hard to fix the mingw32 g++ compilation errors by implementing the +// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE, +// it's simpler to just use a stub Notification class on all platforms. +// +// The default constructor of the stub class is deleted and the declaration of +// the Notify() method is commented out, so that compilation will fail if any +// code actually uses the Notification class. + +#define GTEST_HAS_NOTIFICATION_ 1 +namespace testing { +namespace internal { +class Notification { + public: + Notification() = delete; + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + // void Notify(); + void WaitForNotification() {} +}; +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 0000000000..b9495d8378 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h new file mode 100644 index 0000000000..afaaf17ba2 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h new file mode 100644 index 0000000000..45580ae805 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -0,0 +1,306 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +#include + +#include + +#include "gtest/gtest-matchers.h" +#include "gtest/internal/gtest-internal.h" + +GTEST_DECLARE_string_(internal_run_death_test); + +namespace testing { +namespace internal { + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. + +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, Matcher matcher, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() {} + + // A helper class that aborts a death test when it's deleted. + class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) {} + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + + private: + DeathTest* const test_; + ReturnSentinel(const ReturnSentinel&) = delete; + ReturnSentinel& operator=(const ReturnSentinel&) = delete; + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. + virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const std::string& message); + + private: + // A string containing a description of the outcome of the last death test. + static std::string last_death_test_message_; + + DeathTest(const DeathTest&) = delete; + DeathTest& operator=(const DeathTest&) = delete; +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() {} + virtual bool Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + bool Create(const char* statement, Matcher matcher, + const char* file, int line, DeathTest** test) override; +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads +// and interpreted as a regex (rather than an Eq matcher) for legacy +// compatibility. +inline Matcher MakeDeathTestMatcher( + ::testing::internal::RE regex) { + return ContainsRegex(regex.pattern()); +} +inline Matcher MakeDeathTestMatcher(const char* regex) { + return ContainsRegex(regex); +} +inline Matcher MakeDeathTestMatcher( + const ::std::string& regex) { + return ContainsRegex(regex); +} + +// If a Matcher is passed to EXPECT_DEATH (etc.), it's +// used directly. +inline Matcher MakeDeathTestMatcher( + Matcher matcher) { + return matcher; +} + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. +#if GTEST_HAS_EXCEPTIONS +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf( \ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) { \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +#else +#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +#endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create( \ + #statement, \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher), \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != nullptr) { \ + std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel( \ + gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__) \ + : fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in +// NDEBUG mode. In this case we need the statements to be executed and the macro +// must accept a streamed message even though the message is never printed. +// The regex object is not evaluated, but it is used to prevent "unused" +// warnings and to avoid an expression that doesn't compile in debug mode. +#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) posix::Close(write_fd_); + } + + const std::string& file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete; + InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete; +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h new file mode 100644 index 0000000000..a2a60a962b --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,210 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in gtest/internal/gtest-internal.h. +// Do not include this header file separately! + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") {} + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {} + + explicit FilePath(const std::string& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; } + + const std::string& string() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true if and only if the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". + // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurrence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + std::string pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h new file mode 100644 index 0000000000..9b04e4c85f --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -0,0 +1,1570 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_LINUX +#include +#include +#include +#include +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/internal/gtest-type-util.h" + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar + +// Stringifies its argument. +// Work around a bug in visual studio which doesn't accept code like this: +// +// #define GTEST_STRINGIFY_(name) #name +// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ... +// MACRO(, x, y) +// +// Complaining about the argument to GTEST_STRINGIFY_ being empty. +// This is allowed by the spec. +#define GTEST_STRINGIFY_HELPER_(name, ...) #name +#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, ) + +namespace proto2 { +class MessageLite; +} + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. +class UnitTest; // A collection of test suites. + +template +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// An IgnoredValue object can be implicitly constructed from ANY value. +class IgnoredValue { + struct Sink {}; + + public: + // This constructor template allows any value to be implicitly + // converted to IgnoredValue. The object has no data member and + // doesn't try to remember anything about the argument. We + // deliberately omit the 'explicit' keyword in order to allow the + // conversion to be implicit. + // Disable the conversion if T already has a magical conversion operator. + // Otherwise we get ambiguity. + template ::value, + int>::type = 0> + IgnoredValue(const T& /* ignored */) {} // NOLINT(runtime/explicit) +}; + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg); + +#if GTEST_HAS_EXCEPTIONS + +GTEST_DISABLE_MSC_WARNINGS_PUSH_( + 4275 /* an exported class was derived from a class that was not exported */) + +// This exception is thrown by (and only by) a failed Google Test +// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions +// are enabled). We derive it from std::runtime_error, which is for +// errors presumably detectable only at run time. Since +// std::runtime_error inherits from std::exception, many testing +// frameworks know how to extract and print the message inside it. +class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure); +}; + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4275 + +#endif // GTEST_HAS_EXCEPTIONS + +namespace edit_distance { +// Returns the optimal edits to go from 'left' to 'right'. +// All edits cost the same, with replace having lower priority than +// add/remove. +// Simple implementation of the Wagner-Fischer algorithm. +// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm +enum EditType { kMatch, kAdd, kRemove, kReplace }; +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, const std::vector& right); + +// Same as above, but the input is represented as strings. +GTEST_API_ std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right); + +// Create a diff of the input strings in Unified diff format. +GTEST_API_ std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context = 2); + +} // namespace edit_distance + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true if and only if the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const std::string& expected_value, + const std::string& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8 * sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const uint32_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits& bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true if and only if this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true if and only if this number is at most kMaxUlps ULP's away + // from rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits& sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1, + const Bits& sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . +template <> +inline float FloatingPoint::Max() { + return FLT_MAX; +} +template <> +inline double FloatingPoint::Max() { + return DBL_MAX; +} + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test suite, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template +bool TypeIdHelper::dummy_ = false; + +// GetTypeId() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper::dummy_); +} + +// Returns the type ID of ::testing::Test. Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + TestFactoryBase(const TestFactoryBase&) = delete; + TestFactoryBase& operator=(const TestFactoryBase&) = delete; +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template +class TestFactoryImpl : public TestFactoryBase { + public: + Test* CreateTest() override { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestSuite() and TearDownTestSuite() functions. +using SetUpTestSuiteFunc = void (*)(); +using TearDownTestSuiteFunc = void (*)(); + +struct CodeLocation { + CodeLocation(const std::string& a_file, int a_line) + : file(a_file), line(a_line) {} + + std::string file; + int line; +}; + +// Helper to identify which setup function for TestCase / TestSuite to call. +// Only one function is allowed, either TestCase or TestSute but not both. + +// Utility functions to help SuiteApiResolver +using SetUpTearDownSuiteFuncType = void (*)(); + +inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull( + SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) { + return a == def ? nullptr : a; +} + +template +// Note that SuiteApiResolver inherits from T because +// SetUpTestSuite()/TearDownTestSuite() could be protected. This way +// SuiteApiResolver can access them. +struct SuiteApiResolver : T { + // testing::Test is only forward declared at this point. So we make it a + // dependent class for the compiler to be OK with it. + using Test = + typename std::conditional::type; + + static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename, + int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both SetUpTestSuite and SetUpTestCase, please " + "make sure there is only one present at " + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::SetUpTestSuite; +#endif + } + + static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename, + int line_num) { +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both TearDownTestSuite and TearDownTestCase," + " please make sure there is only one present at" + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; +#else + (void)(filename); + (void)(line_num); + return &T::TearDownTestSuite; +#endif + } +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// State of the definition of a type-parameterized test suite. +class GTEST_API_ TypedTestSuitePState { + public: + TypedTestSuitePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test suite hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, + "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string& test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation& GetCodeLocation(const std::string& test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char* VerifyRegisteredTestNames(const char* test_suite_name, + const char* file, int line, + const char* registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TypedTestCasePState = TypedTestSuitePState; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == nullptr) { + return nullptr; + } + while (IsSpace(*(++comma))) { + } + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == nullptr ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string& str, char delimiter, + ::std::vector<::std::string>* dest); + +// The default argument to the template below for the case when the user does +// not provide a name generator. +struct DefaultNameGenerator { + template + static std::string GetName(int i) { + return StreamableToString(i); + } +}; + +template +struct NameGeneratorSelector { + typedef Provided type; +}; + +template +void GenerateNamesRecursively(internal::None, std::vector*, int) {} + +template +void GenerateNamesRecursively(Types, std::vector* result, int i) { + result->push_back(NameGenerator::template GetName(i)); + GenerateNamesRecursively(typename Types::Tail(), result, + i + 1); +} + +template +std::vector GenerateNames() { + std::vector result; + GenerateNamesRecursively(Types(), &result, 0); + return result; +} + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, const CodeLocation& code_location, + const char* case_name, const char* test_names, int index, + const std::vector& type_names = + GenerateNames()) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + + "/" + type_names[static_cast(index)]) + .c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + nullptr, // No value parameter. + code_location, GetTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite( + code_location.file.c_str(), code_location.line), + SuiteApiResolver::GetTearDownCaseOrSuite( + code_location.file.c_str(), code_location.line), + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest::Register(prefix, + code_location, + case_name, + test_names, + index + 1, + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char* /*prefix*/, const CodeLocation&, + const char* /*case_name*/, const char* /*test_names*/, + int /*index*/, + const std::vector& = + std::vector() /*type_names*/) { + return true; + } +}; + +GTEST_API_ void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location); +GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation( + const char* case_name); + +// TypeParameterizedTestSuite::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char* prefix, CodeLocation code_location, + const TypedTestSuitePState* state, const char* case_name, + const char* test_names, + const std::vector& type_names = + GenerateNames()) { + RegisterTypeParameterizedTestSuiteInstantiation(case_name); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation& test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0, type_names); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestSuite::Register(prefix, code_location, + state, case_name, + SkipComma(test_names), + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char* /*prefix*/, const CodeLocation&, + const TypedTestSuitePState* /*state*/, + const char* /*case_name*/, const char* /*test_names*/, + const std::vector& = + std::vector() /*type_names*/) { + return true; + } +}; + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// Helper for declaring std::string within 'if' statement +// in pre C++17 build environment. +struct TrueWithString { + TrueWithString() = default; + explicit TrueWithString(const char* str) : value(str) {} + explicit TrueWithString(const std::string& str) : value(str) {} + explicit operator bool() const { return true; } + std::string value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const uint32_t kMaxRange = 1u << 31; + + explicit Random(uint32_t seed) : state_(seed) {} + + void Reseed(uint32_t seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + uint32_t Generate(uint32_t range); + + private: + uint32_t state_; + Random(const Random&) = delete; + Random& operator=(const Random&) = delete; +}; + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + typename std::remove_const::type>::type + +// HasDebugStringAndShortDebugString::value is a compile-time bool constant +// that's true if and only if T has methods DebugString() and ShortDebugString() +// that return std::string. +template +class HasDebugStringAndShortDebugString { + private: + template + static auto CheckDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().DebugString())>::type; + template + static std::false_type CheckDebugString(...); + + template + static auto CheckShortDebugString(C*) -> typename std::is_same< + std::string, decltype(std::declval().ShortDebugString())>::type; + template + static std::false_type CheckShortDebugString(...); + + using HasDebugStringType = decltype(CheckDebugString(nullptr)); + using HasShortDebugStringType = decltype(CheckShortDebugString(nullptr)); + + public: + static constexpr bool value = + HasDebugStringType::value && HasShortDebugStringType::value; +}; + +template +constexpr bool HasDebugStringAndShortDebugString::value; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// In C++11 mode we check the existence of a const_iterator and that an +// iterator is properly implemented for the container. +// +// For pre-C++11 that we look for both C::iterator and C::const_iterator. +// The reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template ().begin()), + class = decltype(::std::declval().end()), + class = decltype(++::std::declval()), + class = decltype(*::std::declval()), + class = typename C::const_iterator> +IsContainer IsContainerTest(int /* dummy */) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} + +// Trait to detect whether a type T is a hash table. +// The heuristic used is that the type contains an inner type `hasher` and does +// not contain an inner type `reverse_iterator`. +// If the container is iterable in reverse, then order might actually matter. +template +struct IsHashTable { + private: + template + static char test(typename U::hasher*, typename U::reverse_iterator*); + template + static int test(typename U::hasher*, ...); + template + static char test(...); + + public: + static const bool value = sizeof(test(nullptr, nullptr)) == sizeof(int); +}; + +template +const bool IsHashTable::value; + +template (0)) == sizeof(IsContainer)> +struct IsRecursiveContainerImpl; + +template +struct IsRecursiveContainerImpl : public std::false_type {}; + +// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to +// obey the same inconsistencies as the IsContainerTest, namely check if +// something is a container is relying on only const_iterator in C++11 and +// is relying on both const_iterator and iterator otherwise +template +struct IsRecursiveContainerImpl { + using value_type = decltype(*std::declval()); + using type = + std::is_same::type>::type, + C>; +}; + +// IsRecursiveContainer is a unary compile-time predicate that +// evaluates whether C is a recursive container type. A recursive container +// type is a container type whose value_type is equal to the container type +// itself. An example for a recursive container type is +// boost::filesystem::path, whose iterator has a value_type that is equal to +// boost::filesystem::path. +template +struct IsRecursiveContainer : public IsRecursiveContainerImpl::type {}; + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T& lhs, const U& rhs) { + return lhs == rhs; +} + +// This overload is used when k >= 1. +template +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T& from, U* to) { + *to = from; +} + +// This overload is used when k >= 1. +template +inline void CopyArray(const T (&from)[N], U (*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element* array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element* array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); + } + + private: + static_assert(!std::is_const::value, "Type must not be const"); + static_assert(!std::is_reference::value, + "Type must not be a reference"); + + // Initializes this object with a copy of the input. + void InitCopy(const Element* array, size_t a_size) { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. + void InitRef(const Element* array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element* array_; + size_t size_; + void (NativeArray::*clone_)(const Element*, size_t); +}; + +// Backport of std::index_sequence. +template +struct IndexSequence { + using type = IndexSequence; +}; + +// Double the IndexSequence, and one if plus_one is true. +template +struct DoubleSequence; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; + +// Backport of std::make_index_sequence. +// It uses O(ln(N)) instantiation depth. +template +struct MakeIndexSequenceImpl + : DoubleSequence::type, + N / 2>::type {}; + +template <> +struct MakeIndexSequenceImpl<0> : IndexSequence<> {}; + +template +using MakeIndexSequence = typename MakeIndexSequenceImpl::type; + +template +using IndexSequenceFor = typename MakeIndexSequence::type; + +template +struct Ignore { + Ignore(...); // NOLINT +}; + +template +struct ElemFromListImpl; +template +struct ElemFromListImpl> { + // We make Ignore a template to solve a problem with MSVC. + // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but + // MSVC doesn't understand how to deal with that pack expansion. + // Use `0 * I` to have a single instantiation of Ignore. + template + static R Apply(Ignore<0 * I>..., R (*)(), ...); +}; + +template +struct ElemFromList { + using type = + decltype(ElemFromListImpl::type>::Apply( + static_cast(nullptr)...)); +}; + +struct FlatTupleConstructTag {}; + +template +class FlatTuple; + +template +struct FlatTupleElemBase; + +template +struct FlatTupleElemBase, I> { + using value_type = typename ElemFromList::type; + FlatTupleElemBase() = default; + template + explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t) + : value(std::forward(t)) {} + value_type value; +}; + +template +struct FlatTupleBase; + +template +struct FlatTupleBase, IndexSequence> + : FlatTupleElemBase, Idx>... { + using Indices = IndexSequence; + FlatTupleBase() = default; + template + explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args) + : FlatTupleElemBase, Idx>(FlatTupleConstructTag{}, + std::forward(args))... {} + + template + const typename ElemFromList::type& Get() const { + return FlatTupleElemBase, I>::value; + } + + template + typename ElemFromList::type& Get() { + return FlatTupleElemBase, I>::value; + } + + template + auto Apply(F&& f) -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } + + template + auto Apply(F&& f) const -> decltype(std::forward(f)(this->Get()...)) { + return std::forward(f)(Get()...); + } +}; + +// Analog to std::tuple but with different tradeoffs. +// This class minimizes the template instantiation depth, thus allowing more +// elements than std::tuple would. std::tuple has been seen to require an +// instantiation depth of more than 10x the number of elements in some +// implementations. +// FlatTuple and ElemFromList are not recursive and have a fixed depth +// regardless of T... +// MakeIndexSequence, on the other hand, it is recursive but with an +// instantiation depth of O(ln(N)). +template +class FlatTuple + : private FlatTupleBase, + typename MakeIndexSequence::type> { + using Indices = typename FlatTupleBase< + FlatTuple, typename MakeIndexSequence::type>::Indices; + + public: + FlatTuple() = default; + template + explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args) + : FlatTuple::FlatTupleBase(tag, std::forward(args)...) {} + + using FlatTuple::FlatTupleBase::Apply; + using FlatTuple::FlatTupleBase::Get; +}; + +// Utility functions to be called with static_assert to induce deprecation +// warnings. +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TEST_SUITE_P") +constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE_P is deprecated, please use " + "TYPED_TEST_SUITE_P") +constexpr bool TypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE is deprecated, please use " + "TYPED_TEST_SUITE") +constexpr bool TypedTestCaseIsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "REGISTER_TYPED_TEST_CASE_P is deprecated, please use " + "REGISTER_TYPED_TEST_SUITE_P") +constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TYPED_TEST_SUITE_P") +constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } + +} // namespace internal +} // namespace testing + +namespace std { +// Some standard library implementations use `struct tuple_size` and some use +// `class tuple_size`. Clang warns about the mismatch. +// https://reviews.llvm.org/D55466 +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wmismatched-tags" +#endif +template +struct tuple_size> + : std::integral_constant {}; +#ifdef __clang__ +#pragma clang diagnostic pop +#endif +} // namespace std + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +#define GTEST_SKIP_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip) + +// Suppress MSVC warning 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +// NOTE: The "else" is important to keep this expansion to prevent a top-level +// "else" from attaching to our "if". +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } else /* NOLINT */ \ + static_assert(true, "") // User must have a semicolon after expansion. + +#if GTEST_HAS_EXCEPTIONS + +namespace testing { +namespace internal { + +class NeverThrown { + public: + const char* what() const noexcept { + return "this exception should never be thrown"; + } +}; + +} // namespace internal +} // namespace testing + +#if GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e)) + +#else // GTEST_HAS_RTTI + +#define GTEST_EXCEPTION_TYPE_(e) \ + std::string { "an std::exception-derived error" } + +#endif // GTEST_HAS_RTTI + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (typename std::conditional< \ + std::is_same::type>::type, \ + std::exception>::value, \ + const ::testing::internal::NeverThrown&, const std::exception&>::type \ + e) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception) \ + catch (...) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else /*NOLINT*/ \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value.c_str()) + +#if GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const& e) { \ + gtest_msg.value = "it throws "; \ + gtest_msg.value += GTEST_EXCEPTION_TYPE_(e); \ + gtest_msg.value += " with description \""; \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// representation of expression as it was passed into the EXPECT_TRUE. +#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + test_suite_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &) = delete; /* NOLINT */ \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \ + GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name) &&) noexcept = delete; /* NOLINT */ \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h new file mode 100644 index 0000000000..e7af2f904a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,956 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type and function utilities for implementing parameterized tests. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType& a_param, size_t an_index) + : param(a_param), index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo& info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// Utility Functions + +// Outputs a message explaining invalid registration of different +// fixture class for the same test suite. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location); + +template +class ParamGeneratorInterface; +template +class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + std::unique_ptr> impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + std::shared_ptr> impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. +template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), + end_(end), + step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} + ~RangeGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, begin_, 0, step_); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + ~Iterator() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + void Advance() override { + value_ = static_cast(value_ + step_); + index_++; + } + ParamIteratorInterface* Clone() const override { + return new Iterator(*this); + } + const T* Current() const override { return &value_; } + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), + value_(other.value_), + index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. +template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + ~ValuesInIteratorRangeGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, container_.begin()); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + ~Iterator() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + void Advance() override { + ++iterator_; + value_.reset(); + } + ParamIteratorInterface* Clone() const override { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + const T* Current() const override { + if (value_.get() == nullptr) value_.reset(new T(*iterator_)); + return value_.get(); + } + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of std::unique_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable std::unique_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Default parameterized test name generator, returns a string containing the +// integer test parameter index. +template +std::string DefaultParamName(const TestParamInfo& info) { + Message name_stream; + name_stream << info.index; + return name_stream.GetString(); +} + +template +void TestNotEmpty() { + static_assert(sizeof(T) == 0, "Empty arguments are not allowed."); +} +template +void TestNotEmpty(const T&) {} + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) + : parameter_(parameter) {} + Test* CreateTest() override { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + ParameterizedTestFactory(const ParameterizedTestFactory&) = delete; + ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. +template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestSuiteInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + using ParamType = typename TestSuite::ParamType; + + TestMetaFactory() {} + + TestFactoryBase* CreateTestFactory(ParamType parameter) override { + return new ParameterizedTestFactory(parameter); + } + + private: + TestMetaFactory(const TestMetaFactory&) = delete; + TestMetaFactory& operator=(const TestMetaFactory&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteInfoBase is a generic interface +// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds +// a collection of pointers to the ParameterizedTestSuiteInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestSuiteInfoBase { + public: + virtual ~ParameterizedTestSuiteInfoBase() {} + + // Base part of test suite name for display purposes. + virtual const std::string& GetTestSuiteName() const = 0; + // Test suite id to verify identity. + virtual TypeId GetTestSuiteTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test suite right before running them in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestSuiteInfoBase() {} + + private: + ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) = + delete; + ParameterizedTestSuiteInfoBase& operator=( + const ParameterizedTestSuiteInfoBase&) = delete; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Report a the name of a test_suit as safe to ignore +// as the side effect of construction of this type. +struct GTEST_API_ MarkAsIgnored { + explicit MarkAsIgnored(const char* test_suite); +}; + +GTEST_API_ void InsertSyntheticTestCase(const std::string& name, + CodeLocation location, bool has_test_p); + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test suite and generators +// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that +// test suite. It registers tests with all values generated by all +// generators when asked. +template +class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestSuiteInstantiation(). + using ParamType = typename TestSuite::ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator(GeneratorCreationFunc)(); + using ParamNameGeneratorFunc = std::string(const TestParamInfo&); + + explicit ParameterizedTestSuiteInfo(const char* name, + CodeLocation code_location) + : test_suite_name_(name), code_location_(code_location) {} + + // Test suite base name for display purposes. + const std::string& GetTestSuiteName() const override { + return test_suite_name_; + } + // Test suite id to verify identity. + TypeId GetTestSuiteTypeId() const override { return GetTypeId(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_suite_name is the base name of the test suite (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test suite base name and DoBar is test base name. + void AddTestPattern(const char* test_suite_name, const char* test_base_name, + TestMetaFactoryBase* meta_factory, + CodeLocation code_location) { + tests_.push_back(std::shared_ptr(new TestInfo( + test_suite_name, test_base_name, meta_factory, code_location))); + } + // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information + // about a generator. + int AddTestSuiteInstantiation(const std::string& instantiation_name, + GeneratorCreationFunc* func, + ParamNameGeneratorFunc* name_func, + const char* file, int line) { + instantiations_.push_back( + InstantiationInfo(instantiation_name, func, name_func, file, line)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test suite + // right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more than once on any single + // instance of a ParameterizedTestSuiteInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more than once. + void RegisterTests() override { + bool generated_instantiations = false; + + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + std::shared_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); + gen_it != instantiations_.end(); ++gen_it) { + const std::string& instantiation_name = gen_it->name; + ParamGenerator generator((*gen_it->generator)()); + ParamNameGeneratorFunc* name_func = gen_it->name_func; + const char* file = gen_it->file; + int line = gen_it->line; + + std::string test_suite_name; + if (!instantiation_name.empty()) + test_suite_name = instantiation_name + "/"; + test_suite_name += test_info->test_suite_base_name; + + size_t i = 0; + std::set test_param_names; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + generated_instantiations = true; + + Message test_name_stream; + + std::string param_name = + name_func(TestParamInfo(*param_it, i)); + + GTEST_CHECK_(IsValidParamName(param_name)) + << "Parameterized test name '" << param_name + << "' is invalid, in " << file << " line " << line << std::endl; + + GTEST_CHECK_(test_param_names.count(param_name) == 0) + << "Duplicate parameterized test name '" << param_name << "', in " + << file << " line " << line << std::endl; + + test_param_names.insert(param_name); + + if (!test_info->test_base_name.empty()) { + test_name_stream << test_info->test_base_name << "/"; + } + test_name_stream << param_name; + MakeAndRegisterTestInfo( + test_suite_name.c_str(), test_name_stream.GetString().c_str(), + nullptr, // No type parameter. + PrintToString(*param_it).c_str(), test_info->code_location, + GetTestSuiteTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite(file, line), + SuiteApiResolver::GetTearDownCaseOrSuite(file, line), + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + + if (!generated_instantiations) { + // There are no generaotrs, or they all generate nothing ... + InsertSyntheticTestCase(GetTestSuiteName(), code_location_, + !tests_.empty()); + } + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory, + CodeLocation a_code_location) + : test_suite_base_name(a_test_suite_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory), + code_location(a_code_location) {} + + const std::string test_suite_base_name; + const std::string test_base_name; + const std::unique_ptr> test_meta_factory; + const CodeLocation code_location; + }; + using TestInfoContainer = ::std::vector>; + // Records data received from INSTANTIATE_TEST_SUITE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string& name_in, + GeneratorCreationFunc* generator_in, + ParamNameGeneratorFunc* name_func_in, const char* file_in, + int line_in) + : name(name_in), + generator(generator_in), + name_func(name_func_in), + file(file_in), + line(line_in) {} + + std::string name; + GeneratorCreationFunc* generator; + ParamNameGeneratorFunc* name_func; + const char* file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string& name) { + // Check for empty string + if (name.empty()) return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!IsAlNum(name[index]) && name[index] != '_') return false; + } + + return true; + } + + const std::string test_suite_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete; + ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) = + delete; +}; // class ParameterizedTestSuiteInfo + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +template +using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteRegistry contains a map of +// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P +// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding +// ParameterizedTestSuiteInfo descriptors. +class ParameterizedTestSuiteRegistry { + public: + ParameterizedTestSuiteRegistry() {} + ~ParameterizedTestSuiteRegistry() { + for (auto& test_suite_info : test_suite_infos_) { + delete test_suite_info; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test suite. + template + ParameterizedTestSuiteInfo* GetTestSuitePatternHolder( + const char* test_suite_name, CodeLocation code_location) { + ParameterizedTestSuiteInfo* typed_test_info = nullptr; + for (auto& test_suite_info : test_suite_infos_) { + if (test_suite_info->GetTestSuiteName() == test_suite_name) { + if (test_suite_info->GetTestSuiteTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test suite setup and tear-down in this case. + ReportInvalidTestSuiteType(test_suite_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestSuiteInfo>(test_suite_info); + } + break; + } + } + if (typed_test_info == nullptr) { + typed_test_info = new ParameterizedTestSuiteInfo( + test_suite_name, code_location); + test_suite_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (auto& test_suite_info : test_suite_infos_) { + test_suite_info->RegisterTests(); + } + } +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, CodeLocation code_location) { + return GetTestSuitePatternHolder(test_case_name, code_location); + } + +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + private: + using TestSuiteInfoContainer = ::std::vector; + + TestSuiteInfoContainer test_suite_infos_; + + ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) = + delete; + ParameterizedTestSuiteRegistry& operator=( + const ParameterizedTestSuiteRegistry&) = delete; +}; + +// Keep track of what type-parameterized test suite are defined and +// where as well as which are intatiated. This allows susequently +// identifying suits that are defined but never used. +class TypeParameterizedTestSuiteRegistry { + public: + // Add a suite definition + void RegisterTestSuite(const char* test_suite_name, + CodeLocation code_location); + + // Add an instantiation of a suit. + void RegisterInstantiation(const char* test_suite_name); + + // For each suit repored as defined but not reported as instantiation, + // emit a test that reports that fact (configurably, as an error). + void CheckForInstantiations(); + + private: + struct TypeParameterizedTestSuiteInfo { + explicit TypeParameterizedTestSuiteInfo(CodeLocation c) + : code_location(c), instantiated(false) {} + + CodeLocation code_location; + bool instantiated; + }; + + std::map suites_; +}; + +} // namespace internal + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { +// Used in the Values() function to provide polymorphic capabilities. + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4100) +#endif + +template +class ValueArray { + public: + explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {} + + template + operator ParamGenerator() const { // NOLINT + return ValuesIn(MakeVector(MakeIndexSequence())); + } + + private: + template + std::vector MakeVector(IndexSequence) const { + return std::vector{static_cast(v_.template Get())...}; + } + + FlatTuple v_; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +template +class CartesianProductGenerator + : public ParamGeneratorInterface<::std::tuple> { + public: + typedef ::std::tuple ParamType; + + CartesianProductGenerator(const std::tuple...>& g) + : generators_(g) {} + ~CartesianProductGenerator() override {} + + ParamIteratorInterface* Begin() const override { + return new Iterator(this, generators_, false); + } + ParamIteratorInterface* End() const override { + return new Iterator(this, generators_, true); + } + + private: + template + class IteratorImpl; + template + class IteratorImpl> + : public ParamIteratorInterface { + public: + IteratorImpl(const ParamGeneratorInterface* base, + const std::tuple...>& generators, + bool is_end) + : base_(base), + begin_(std::get(generators).begin()...), + end_(std::get(generators).end()...), + current_(is_end ? end_ : begin_) { + ComputeCurrentValue(); + } + ~IteratorImpl() override {} + + const ParamGeneratorInterface* BaseGenerator() const override { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + void Advance() override { + assert(!AtEnd()); + // Advance the last iterator. + ++std::get(current_); + // if that reaches end, propagate that up. + AdvanceIfEnd(); + ComputeCurrentValue(); + } + ParamIteratorInterface* Clone() const override { + return new IteratorImpl(*this); + } + + const ParamType* Current() const override { return current_value_.get(); } + + bool Equals(const ParamIteratorInterface& other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const IteratorImpl* typed_other = + CheckedDowncastToActualType(&other); + + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + if (AtEnd() && typed_other->AtEnd()) return true; + + bool same = true; + bool dummy[] = { + (same = same && std::get(current_) == + std::get(typed_other->current_))...}; + (void)dummy; + return same; + } + + private: + template + void AdvanceIfEnd() { + if (std::get(current_) != std::get(end_)) return; + + bool last = ThisI == 0; + if (last) { + // We are done. Nothing else to propagate. + return; + } + + constexpr size_t NextI = ThisI - (ThisI != 0); + std::get(current_) = std::get(begin_); + ++std::get(current_); + AdvanceIfEnd(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = std::make_shared(*std::get(current_)...); + } + bool AtEnd() const { + bool at_end = false; + bool dummy[] = { + (at_end = at_end || std::get(current_) == std::get(end_))...}; + (void)dummy; + return at_end; + } + + const ParamGeneratorInterface* const base_; + std::tuple::iterator...> begin_; + std::tuple::iterator...> end_; + std::tuple::iterator...> current_; + std::shared_ptr current_value_; + }; + + using Iterator = IteratorImpl::type>; + + std::tuple...> generators_; +}; + +template +class CartesianProductHolder { + public: + CartesianProductHolder(const Gen&... g) : generators_(g...) {} + template + operator ParamGenerator<::std::tuple>() const { + return ParamGenerator<::std::tuple>( + new CartesianProductGenerator(generators_)); + } + + private: + std::tuple generators_; +}; + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 0000000000..f025db76ad --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,116 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 +#elif defined _WIN32 +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE +#elif defined __OS2__ +#define GTEST_OS_OS2 1 +#elif defined __APPLE__ +#define GTEST_OS_MAC 1 +#include +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif +#elif defined __DragonFly__ +#define GTEST_OS_DRAGONFLY 1 +#elif defined __FreeBSD__ +#define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GNU__) +#define GTEST_OS_GNU_HURD 1 +#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) +#define GTEST_OS_GNU_KFREEBSD 1 +#elif defined __linux__ +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif +#elif defined __MVS__ +#define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +#define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +#define GTEST_OS_AIX 1 +#elif defined(__hpux) +#define GTEST_OS_HPUX 1 +#elif defined __native_client__ +#define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +#define GTEST_OS_NETBSD 1 +#elif defined __OpenBSD__ +#define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +#define GTEST_OS_QNX 1 +#elif defined(__HAIKU__) +#define GTEST_OS_HAIKU 1 +#elif defined ESP8266 +#define GTEST_OS_ESP8266 1 +#elif defined ESP32 +#define GTEST_OS_ESP32 1 +#elif defined(__XTENSA__) +#define GTEST_OS_XTENSA 1 +#endif // __CYGWIN__ + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h new file mode 100644 index 0000000000..0003d27658 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -0,0 +1,2413 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_DRAGONFLY - DragonFlyBSD +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_HURD - GNU/Hurd +// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD +// GTEST_OS_HAIKU - Haiku +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_OS2 - OS/2 +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. +// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GTEST_USES_RE2 - the RE2 regular expression library is used +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above RE\b(s) are mutually exclusive. + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. +// GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter or +// UniversalPrinter specializations. +// GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter +// or +// UniversalPrinter +// specializations. +// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or +// Matcher +// specializations. +// GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter or +// UniversalPrinter +// specializations. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Regular expressions: +// RE - a simple regular expression class using +// 1) the RE2 syntax on all platforms when built with RE2 +// and Abseil as dependencies +// 2) the POSIX Extended Regular Expression syntax on +// UNIX-like platforms, +// 3) A reduced regular exception syntax on other platforms, +// including Windows. +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// TimeInMillis - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an int32_t environment variable. +// StringFromGTestEnv() - parses a string environment variable. +// +// Deprecation warnings: +// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as +// deprecated; calling a marked function +// should generate a compiler warning + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include + +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include +#include +#include +#include +#include +#include +// #include // Guarded by GTEST_IS_THREADSAFE below +#include +#include +#include + +#ifndef _WIN32_WCE +#include +#include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +#include +#include +#endif + +#include "gtest/internal/custom/gtest-port.h" +#include "gtest/internal/gtest-port-arch.h" + +#if GTEST_HAS_ABSL +#include "absl/flags/declare.h" +#include "absl/flags/flag.h" +#include "absl/flags/reflection.h" +#endif + +#if !defined(GTEST_DEV_EMAIL_) +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. +// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if defined(_MSC_VER) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) +#else +// Not all compilers are MSVC +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") +#else +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif +// In order to avoid having to include , use forward declaration +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif +#elif GTEST_OS_XTENSA +#include +// Xtensa toolchains define strcasecmp in the string.h header instead of +// strings.h. string.h is already included. +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +#include // NOLINT +#endif + +// Defines this to true if and only if Google Test can use POSIX regular +// expressions. +#ifndef GTEST_HAS_POSIX_RE +#if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA) +#endif +#endif + +// Select the regular expression implementation. +#if GTEST_HAS_ABSL +// When using Abseil, RE2 is required. +#include "absl/strings/string_view.h" +#include "re2/re2.h" +#define GTEST_USES_RE2 1 +#elif GTEST_HAS_POSIX_RE +#include // NOLINT +#define GTEST_USES_POSIX_RE 1 +#else +// Use our own simple regex implementation. +#define GTEST_USES_SIMPLE_RE 1 +#endif + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +#if defined(_MSC_VER) && defined(_CPPUNWIND) +// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__BORLANDC__) +// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. +#ifndef _HAS_EXCEPTIONS +#define _HAS_EXCEPTIONS 1 +#endif // _HAS_EXCEPTIONS +#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +#elif defined(__clang__) +// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang +// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files, +// there can be cleanups for ObjC exceptions which also need cleanups, even if +// C++ exceptions are disabled. clang has __has_feature(cxx_exceptions) which +// checks for C++ exceptions starting at clang r206352, but which checked for +// cleanups prior to that. To reliably check for C++ exception availability with +// clang, check for +// __EXCEPTIONS && __has_feature(cxx_exceptions). +#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions)) +#elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled. +#define GTEST_HAS_EXCEPTIONS 1 +#elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +#define GTEST_HAS_EXCEPTIONS 1 +#else +// For other compilers, we assume exceptions are disabled to be +// conservative. +#define GTEST_HAS_EXCEPTIONS 0 +#endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +#define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA)) + +#endif // GTEST_HAS_STD_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +#ifdef _MSC_VER + +#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled. +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is +// enabled. +#elif defined(__GNUC__) + +#ifdef __GXX_RTTI +// When building against STLport with the Android NDK and with +// -frtti -fno-exceptions, the build fails at link time with undefined +// references to __cxa_bad_typeid. Note sure if STL or toolchain bug, +// so disable RTTI when detected. +#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +#elif defined(__clang__) + +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +#else + +// For all other compilers, we assume RTTI is enabled. +#define GTEST_HAS_RTTI 1 + +#endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +#include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_HAIKU || GTEST_OS_GNU_HURD) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +#include // NOLINT + +// For timespec and nanosleep, used below. +#include // NOLINT +#endif + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ + GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \ + GTEST_OS_GNU_HURD) +#define GTEST_HAS_DEATH_TEST 1 +#endif + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_GNU_HURD +#define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#elif defined(__clang__) +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +#if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__(( \ + __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) +#else +#define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +#define GTEST_HAS_SEH 1 +#else +// Assume no SEH. +#define GTEST_HAS_SEH 0 +#endif + +#endif // GTEST_HAS_SEH + +#ifndef GTEST_IS_THREADSAFE + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \ + (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \ + GTEST_HAS_PTHREAD) + +#endif // GTEST_IS_THREADSAFE + +#if GTEST_IS_THREADSAFE +// Some platforms don't support including these threading related headers. +#include // NOLINT +#include // NOLINT +#endif // GTEST_IS_THREADSAFE + +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + +#ifdef _MSC_VER +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif +#elif __GNUC__ >= 4 || defined(__clang__) +#define GTEST_API_ __attribute__((visibility("default"))) +#endif // _MSC_VER + +#endif // GTEST_API_ + +#ifndef GTEST_API_ +#define GTEST_API_ +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +#define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +#define GTEST_NO_INLINE_ +#endif + +#if defined(__clang__) +// Nested ifs to avoid triggering MSVC warning. +#if __has_attribute(disable_tail_calls) +// Ask the compiler not to perform tail call optimization inside +// the marked function. +#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls)) +#endif +#elif __GNUC__ +#define GTEST_NO_TAIL_CALL_ \ + __attribute__((optimize("no-optimize-sibling-calls"))) +#else +#define GTEST_NO_TAIL_CALL_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. +#if !defined(GTEST_HAS_CXXABI_H_) +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable HWAddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +namespace testing { + +class Message; + +// Legacy imports for backwards compatibility. +// New code should use std:: names directly. +using std::get; +using std::make_tuple; +using std::tuple; +using std::tuple_element; +using std::tuple_size; + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines RE. + +#if GTEST_USES_RE2 + +// This is almost `using RE = ::RE2`, except it is copy-constructible, and it +// needs to disambiguate the `std::string`, `absl::string_view`, and `const +// char*` constructors. +class GTEST_API_ RE { + public: + RE(absl::string_view regex) : regex_(regex) {} // NOLINT + RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT + RE(const RE& other) : RE(other.pattern()) {} + + const std::string& pattern() const { return regex_.pattern(); } + + static bool FullMatch(absl::string_view str, const RE& re) { + return RE2::FullMatch(str, re.regex_); + } + static bool PartialMatch(absl::string_view str, const RE& re) { + return RE2::PartialMatch(str, re.regex_); + } + + private: + RE2 regex_; +}; + +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. + RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true if and only if regular expression re + // matches the entire str. + // PartialMatch(str, re) returns true if and only if regular expression re + // matches a substring of str (including str itself). + static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif +}; + +#endif // ::testing::internal::RE implementation + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. + +enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL }; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTestLog(const GTestLog&) = delete; + GTestLog& operator=(const GTestLog&) = delete; +}; + +#if !defined(GTEST_LOG_) + +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__) \ + .GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(nullptr); } + +#endif // !defined(GTEST_LOG_) + +#if !defined(GTEST_CHECK_) +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsis: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. It aborts the program irrespective of +// whether it is built in the debug mode or not. +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " +#endif // !defined(GTEST_CHECK_) + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. +#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " << gtest_error + +// Transforms "T" into "const T&" according to standard reference collapsing +// rules (this is only needed as a backport for C++98 compilers that do not +// support reference collapsing). Specifically, it transforms: +// +// char ==> const char& +// const char ==> const char& +// char& ==> char& +// const char& ==> const char& +// +// Note that the non-const reference will not have "const" added. This is +// standard, and necessary so that "T" can always bind to "const T&". +template +struct ConstRef { + typedef const T& type; +}; +template +struct ConstRef { + typedef T& type; +}; + +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + typename ::testing::internal::ConstRef::type + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertible to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template +inline To ImplicitCast_(To x) { + return x; +} + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. +// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + GTEST_INTENTIONAL_CONST_COND_PUSH_() + if (false) { + GTEST_INTENTIONAL_CONST_COND_POP_() + const To to = nullptr; + ::testing::internal::ImplicitCast_(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == nullptr || dynamic_cast(f) != nullptr); +#endif + return static_cast(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. +template +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); +#endif + +#if GTEST_HAS_DOWNCAST_ + return ::down_cast(base); +#elif GTEST_HAS_RTTI + return dynamic_cast(base); // NOLINT +#else + return static_cast(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. +// +GTEST_API_ void CaptureStdout(); +GTEST_API_ std::string GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ std::string GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION +// Returns the size (in bytes) of a file. +GTEST_API_ size_t GetFileSize(FILE* file); + +// Reads the entire content of a file as a string. +GTEST_API_ std::string ReadEntireFile(FILE* file); + +// All command line arguments. +GTEST_API_ std::vector GetArgvs(); + +#if GTEST_HAS_DEATH_TEST + +std::vector GetInjectableArgvs(); +// Deprecated: pass the args vector by value instead. +void SetInjectableArgvs(const std::vector* new_argvs); +void SetInjectableArgvs(const std::vector& new_argvs); +void ClearInjectableArgvs(); + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. +#if GTEST_IS_THREADSAFE + +#if GTEST_OS_WINDOWS +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void* Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true if and only if the handle is a valid handle object that can be + // closed. + bool IsCloseable() const; + + Handle handle_; + + AutoHandle(const AutoHandle&) = delete; + AutoHandle& operator=(const AutoHandle&) = delete; +}; +#endif + +#if GTEST_HAS_NOTIFICATION_ +// Notification has already been imported into the namespace. +// Nothing to do here. + +#else +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +// TODO(b/203539622): Replace unconditionally with absl::Notification. +class GTEST_API_ Notification { + public: + Notification() : notified_(false) {} + Notification(const Notification&) = delete; + Notification& operator=(const Notification&) = delete; + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + std::lock_guard lock(mu_); + notified_ = true; + cv_.notify_all(); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + std::unique_lock lock(mu_); + cv_.wait(lock, [this]() { return notified_; }); + } + + private: + std::mutex mu_; + std::condition_variable cv_; + bool notified_; +}; +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 +#endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast(thread)->Run(); + return nullptr; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() override { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr)); + finished_ = true; + } + } + + void Run() override { + if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc* const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true if and only if we know that the thread function has + // finished. + pthread_t thread_; // The native thread object. + + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; +}; +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + GTEST_CRITICAL_SECTION* critical_section_; + + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex* const mutex_; + + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. + virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + ThreadLocalBase(const ThreadLocalBase&) = delete; + ThreadLocalBase& operator=(const ThreadLocalBase&) = delete; +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {} + virtual ~RunnableImpl() {} + virtual void Run() { func_(param_); } + + private: + UserThreadFunc* const func_; + const T param_; + + RunnableImpl(const RunnableImpl&) = delete; + RunnableImpl& operator=(const RunnableImpl&) = delete; + }; + + ThreadWithParam(const ThreadWithParam&) = delete; + ThreadWithParam& operator=(const ThreadWithParam&) = delete; +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; + }; + + T* GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); + } + + ThreadLocalValueHolderBase* NewValueForCurrentThread() const override { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } + + private: + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + ValueHolder* MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; + }; + + std::unique_ptr default_factory_; + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; +}; + +#elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0} + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + has_owner_ = false; + } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } + + private: + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTestMutexLock(const GTestMutexLock&) = delete; + GTestMutexLock& operator=(const GTestMutexLock&) = delete; +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T& value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + ValueHolder(const ValueHolder&) = delete; + ValueHolder& operator=(const ValueHolder&) = delete; + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != nullptr) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder* MakeNewHolder() const = 0; + + private: + ValueHolderFactory(const ValueHolderFactory&) = delete; + ValueHolderFactory& operator=(const ValueHolderFactory&) = delete; + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder* MakeNewHolder() const override { return new ValueHolder(); } + + private: + DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete; + DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) = + delete; + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T& value) : value_(value) {} + ValueHolder* MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete; + InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) = + delete; + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + std::unique_ptr default_factory_; + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; +}; + +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 +#else +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. + +inline bool IsAlpha(char ch) { + return isalpha(static_cast(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast(ch)) != 0; +} +#ifdef __cpp_char8_t +inline bool IsXDigit(char8_t ch) { + return isxdigit(static_cast(ch)) != 0; +} +#endif +inline bool IsXDigit(char16_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(char32_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} +inline bool IsXDigit(wchar_t ch) { + const unsigned char low_byte = static_cast(ch); + return ch == low_byte && isxdigit(low_byte) != 0; +} + +inline char ToLower(char ch) { + return static_cast(tolower(static_cast(ch))); +} +inline char ToUpper(char ch) { + return static_cast(toupper(static_cast(ch))); +} + +inline std::string StripTrailingSpaces(std::string str) { + std::string::iterator it = str.end(); + while (it != str.begin() && IsSpace(*--it)) it = str.erase(it); + return str; +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. + +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +#ifdef __BORLANDC__ +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +#else // !__BORLANDC__ +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \ + GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM) +inline int DoIsATTY(int /* fd */) { return 0; } +#else +inline int DoIsATTY(int fd) { return _isatty(fd); } +#endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +#endif // __BORLANDC__ + +#if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. +#else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE + +#elif GTEST_OS_ESP8266 +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { + // stat function not implemented on ESP8266 + return 0; +} +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int DoIsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +inline int IsATTY(int fd) { + // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout + // to a file on Linux), which is unexpected, so save the previous value, and + // restore it after the call. + int savedErrno = errno; + int isAttyValue = DoIsATTY(fd); + errno = savedErrno; + + return isAttyValue; +} + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + struct wchar_codecvt : public std::codecvt {}; + std::wstring_convert converter; + std::wstring wide_path = converter.from_bytes(path); + std::wstring wide_mode = converter.from_bytes(mode); + return _wfopen(wide_path.c_str(), wide_mode.c_str()); +#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + return fopen(path, mode); +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE* FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA + // We are on an embedded platform, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return nullptr; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != nullptr && env[0] != '\0') ? env : nullptr; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. +[[noreturn]] void Abort(); +#else +[[noreturn]] inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s +#define GTEST_SNPRINTF_ _snprintf +#else +#define GTEST_SNPRINTF_ snprintf +#endif + +// The biggest signed integer type the compiler supports. +// +// long long is guaranteed to be at least 64-bits in C++11. +using BiggestInt = long long; // NOLINT + +// The maximum number a BiggestInt can represent. +constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits::max)(); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + using UInt = void; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + using Int = std::int32_t; + using UInt = std::uint32_t; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: + using Int = std::int64_t; + using UInt = std::uint64_t; +}; + +// Integer types of known sizes. +using TimeInMillis = int64_t; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +#define GTEST_FLAG_NAME_(name) gtest_##name +#define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +// Pick a command line flags implementation. +#if GTEST_HAS_ABSL + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc) + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) \ + ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_int32_(name) \ + ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name)) +#define GTEST_DECLARE_string_(name) \ + ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name)) + +#define GTEST_FLAG_SAVER_ ::absl::FlagSaver + +#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name)) +#define GTEST_FLAG_SET(name, value) \ + (void)(::absl::SetFlag(>EST_FLAG(name), value)) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0 + +#else // GTEST_HAS_ABSL + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DEFINE_string_(name, default_val, doc) \ + namespace testing { \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) \ + namespace testing { \ + GTEST_API_ extern bool GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_int32_(name) \ + namespace testing { \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") +#define GTEST_DECLARE_string_(name) \ + namespace testing { \ + GTEST_API_ extern ::std::string GTEST_FLAG(name); \ + } \ + static_assert(true, "no-op to require trailing semicolon") + +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name) +#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 + +#endif // GTEST_HAS_ABSL + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +GTEST_API_ bool ParseInt32(const Message& src_text, const char* str, + int32_t* value); + +// Parses a bool/int32_t/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ int32_t Int32FromGTestEnv(const char* flag, int32_t default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char* StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#if !defined(GTEST_INTERNAL_DEPRECATED) + +// Internal Macro to mark an API deprecated, for googletest usage only +// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or +// GTEST_INTERNAL_DEPRECATED(message) myFunction(); Every usage of +// a deprecated entity will trigger a warning when compiled with +// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler). +// For msvc /W3 option will need to be used +// Note that for 'other' compilers this macro evaluates to nothing to prevent +// compilations errors. +#if defined(_MSC_VER) +#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__GNUC__) +#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message))) +#else +#define GTEST_INTERNAL_DEPRECATED(message) +#endif + +#endif // !defined(GTEST_INTERNAL_DEPRECATED) + +#if GTEST_HAS_ABSL +// Always use absl::any for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_ANY 1 +#include "absl/types/any.h" +namespace testing { +namespace internal { +using Any = ::absl::any; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::any for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_ANY 1 +#include +namespace testing { +namespace internal { +using Any = ::std::any; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::any is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::optional for UniversalPrinter<> specializations if +// googletest is built with absl support. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include "absl/types/optional.h" +namespace testing { +namespace internal { +template +using Optional = ::absl::optional; +inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; } +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::optional for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_OPTIONAL 1 +#include +namespace testing { +namespace internal { +template +using Optional = ::std::optional; +inline ::std::nullopt_t Nullopt() { return ::std::nullopt; } +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::optional is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::string_view for Matcher<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include "absl/strings/string_view.h" +namespace testing { +namespace internal { +using StringView = ::absl::string_view; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::string_view for Matcher<> +// specializations. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include +namespace testing { +namespace internal { +using StringView = ::std::string_view; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::string_view is not +// supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#if GTEST_HAS_ABSL +// Always use absl::variant for UniversalPrinter<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include "absl/types/variant.h" +namespace testing { +namespace internal { +template +using Variant = ::absl::variant; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::variant for UniversalPrinter<> +// specializations. +#define GTEST_INTERNAL_HAS_VARIANT 1 +#include +namespace testing { +namespace internal { +template +using Variant = ::std::variant; +} // namespace internal +} // namespace testing +// The case where absl is configured NOT to alias std::variant is not supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h new file mode 100644 index 0000000000..cca2e1f2ad --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -0,0 +1,177 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by gtest-internal.h. +// It should not be #included by other files. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +#include +#endif + +#include + +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true if and only if they have the same + // content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true if and only if they have the + // same content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Returns true if and only if the given string ends with the given suffix, + // ignoring case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value to given width with leading zeros. + static std::string FormatIntWidthN(int value, int width); + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats an int value as "%X". + static std::string FormatHexUInt32(uint32_t value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ std::string StringStreamToString(::std::stringstream* stream); + +} // namespace internal +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h new file mode 100644 index 0000000000..6bc02a7de3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,186 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type utilities needed for implementing typed and type-parameterized +// tests. + +// IWYU pragma: private, include "gtest/gtest.h" +// IWYU pragma: friend gtest/.* +// IWYU pragma: friend gmock/.* + +#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +#if GTEST_HAS_CXXABI_H_ +#include +#elif defined(__HP_aCC) +#include +#endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + +#if GTEST_HAS_RTTI +// GetTypeName(const std::type_info&) returns a human-readable name of type T. +inline std::string GetTypeName(const std::type_info& type) { + const char* const name = type.name(); +#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +#if GTEST_HAS_CXXABI_H_ + using abi::__cxa_demangle; +#endif // GTEST_HAS_CXXABI_H_ + char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status); + const std::string name_str(status == 0 ? readable_name : name); + free(readable_name); + return CanonicalizeForStdLibVersioning(name_str); +#else + return name; +#endif // GTEST_HAS_CXXABI_H_ || __HP_aCC +} +#endif // GTEST_HAS_RTTI + +// GetTypeName() returns a human-readable name of type T if and only if +// RTTI is enabled, otherwise it returns a dummy type name. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +std::string GetTypeName() { +#if GTEST_HAS_RTTI + return GetTypeName(typeid(T)); +#else + return ""; +#endif // GTEST_HAS_RTTI +} + +// A unique type indicating an empty node +struct None {}; + +#define GTEST_TEMPLATE_ \ + template \ + class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind::type + +template +struct Templates { + using Head = TemplateSel; + using Tail = Templates; +}; + +template +struct Templates { + using Head = TemplateSel; + using Tail = None; +}; + +// Tuple-like type lists +template +struct Types { + using Head = Head_; + using Tail = Types; +}; + +template +struct Types { + using Head = Head_; + using Tail = None; +}; + +// Helper metafunctions to tell apart a single type from types +// generated by ::testing::Types +template +struct ProxyTypeList { + using type = Types; +}; + +template +struct is_proxy_type_list : std::false_type {}; + +template +struct is_proxy_type_list> : std::true_type {}; + +// Generator which conditionally creates type lists. +// It recognizes if a requested type list should be created +// and prevents creating a new type list nested within another one. +template +struct GenerateTypeList { + private: + using proxy = typename std::conditional::value, T, + ProxyTypeList>::type; + + public: + using type = typename proxy::type; +}; + +} // namespace internal + +template +using Types = internal::ProxyTypeList; + +} // namespace testing + +#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc index 912868148e..2a70ed88c7 100644 --- a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: mheule@google.com (Markus Heule) -// -// Google C++ Testing Framework (Google Test) +// Google C++ Testing and Mocking Framework (Google Test) // // Sometimes it's desirable to build Google Test by compiling a single file. // This file serves this purpose. @@ -39,9554 +38,12 @@ #include "gtest/gtest.h" // The following lines pull in the real gtest *.cc files. -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) - -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Utilities for testing Google Test itself and code that uses Google Test -// (e.g. frameworks built on top of Google Test). - -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - - -namespace testing { - -// This helper class can be used to mock out Google Test failure reporting -// so that we can test Google Test or code that builds on Google Test. -// -// An object of this class appends a TestPartResult object to the -// TestPartResultArray object given in the constructor whenever a Google Test -// failure is reported. It can either intercept only failures that are -// generated in the same thread that created this object or it can intercept -// all generated failures. The scope of this mock object can be controlled with -// the second argument to the two arguments constructor. -class GTEST_API_ ScopedFakeTestPartResultReporter - : public TestPartResultReporterInterface { - public: - // The two possible mocking modes of this object. - enum InterceptMode { - INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. - INTERCEPT_ALL_THREADS // Intercepts all failures. - }; - - // The c'tor sets this object as the test part result reporter used - // by Google Test. The 'result' parameter specifies where to report the - // results. This reporter will only catch failures generated in the current - // thread. DEPRECATED - explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); - - // Same as above, but you can choose the interception scope of this object. - ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, - TestPartResultArray* result); - - // The d'tor restores the previous test part result reporter. - virtual ~ScopedFakeTestPartResultReporter(); - - // Appends the TestPartResult object to the TestPartResultArray - // received in the constructor. - // - // This method is from the TestPartResultReporterInterface - // interface. - virtual void ReportTestPartResult(const TestPartResult& result); - private: - void Init(); - - const InterceptMode intercept_mode_; - TestPartResultReporterInterface* old_reporter_; - TestPartResultArray* const result_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); -}; - -namespace internal { - -// A helper class for implementing EXPECT_FATAL_FAILURE() and -// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. -class GTEST_API_ SingleFailureChecker { - public: - // The constructor remembers the arguments. - SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); - ~SingleFailureChecker(); - private: - const TestPartResultArray* const results_; - const TestPartResult::Type type_; - const string substr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); -}; - -} // namespace internal - -} // namespace testing - -// A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given -// statement will cause exactly one fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_FATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - 'statement' cannot reference local non-static variables or -// non-static members of the current object. -// - 'statement' cannot return a value. -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. The AcceptsMacroThatExpandsToUnprotectedComma test in -// gtest_unittest.cc will fail to compile if we do that. -#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -// A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// 'statement' is allowed to reference local variables and members of -// the current object. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. If we do that, the code won't compile when the user gives -// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that -// expands to code containing an unprotected comma. The -// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc -// catches that. -// -// For the same reason, we have to write -// if (::testing::internal::AlwaysTrue()) { statement; } -// instead of -// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -// to avoid an MSVC warning on unreachable code. -#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include // NOLINT -#include -#include - -#if GTEST_OS_LINUX - -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -# include // NOLINT -# include // NOLINT -# include // NOLINT -// Declares vsnprintf(). This header is not available on Windows. -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include - -#elif GTEST_OS_SYMBIAN -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -#elif GTEST_OS_ZOS -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -// On z/OS we additionally need strings.h for strcasecmp. -# include // NOLINT - -#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. - -# include // NOLINT - -#elif GTEST_OS_WINDOWS // We are on Windows proper. - -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT - -# if GTEST_OS_WINDOWS_MINGW -// MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on -// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW -// supports these. consider using them instead. -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT - -#else - -// Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT -# include // NOLINT - -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// -// This file contains purely Google Test's internal implementation. Please -// DO NOT #INCLUDE IT IN A USER PROGRAM. - -#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ -#define GTEST_SRC_GTEST_INTERNAL_INL_H_ - -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// A user is trying to include this from his code - just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - -#ifndef _WIN32_WCE -# include -#endif // !_WIN32_WCE -#include -#include // For strtoll/_strtoul64/malloc/free. -#include // For memmove. - -#include -#include -#include - - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -#if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS - - -namespace testing { - -// Declares the flags. -// -// We don't want the users to modify this flag in the code, but want -// Google Test's own unit tests to be able to access it. Therefore we -// declare it here as opposed to in gtest.h. -GTEST_DECLARE_bool_(death_test_use_fork); - -namespace internal { - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; - -// Names of the flags (needed for parsing Google Test flags). -const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kPrintTimeFlag[] = "print_time"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; - -// A valid random seed must be in [1, kMaxRandomSeed]. -const int kMaxRandomSeed = 99999; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -GTEST_API_ extern bool g_help_flag; - -// Returns the current time in milliseconds. -GTEST_API_ TimeInMillis GetTimeInMillis(); - -// Returns true iff Google Test should use colors in the output. -GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); - -// Formats the given time in milliseconds as seconds. -GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); - -// Converts the given time in milliseconds to a date string in the ISO 8601 -// format, without the timezone information. N.B.: due to the use the -// non-reentrant localtime() function, this function is not thread safe. Do -// not use it in any code that can be called from multiple threads. -GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); - -// Parses a string for an Int32 flag, in the form of "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, Int32* value); - -// Returns a random seed in range [1, kMaxRandomSeed] based on the -// given --gtest_random_seed flag value. -inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); - - // Normalizes the actual seed to range [1, kMaxRandomSeed] such that - // it's easy to type. - const int normalized_seed = - static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; - return normalized_seed; -} - -// Returns the first valid random seed after 'seed'. The behavior is -// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is -// considered to be 1. -inline int GetNextRandomSeed(int seed) { - GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) - << "Invalid random seed " << seed << " - must be in [1, " - << kMaxRandomSeed << "]."; - const int next_seed = seed + 1; - return (next_seed > kMaxRandomSeed) ? 1 : next_seed; -} - -// This class saves the values of all Google Test flags in its c'tor, and -// restores them in its d'tor. -class GTestFlagSaver { - public: - // The c'tor. - GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - print_time_ = GTEST_FLAG(print_time); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); - } - - // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. - ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; - } - - private: - // Fields for saving the original values of flags. - bool also_run_disabled_tests_; - bool break_on_failure_; - bool catch_exceptions_; - std::string color_; - std::string death_test_style_; - bool death_test_use_fork_; - std::string filter_; - std::string internal_run_death_test_; - bool list_tests_; - std::string output_; - bool print_time_; - internal::Int32 random_seed_; - internal::Int32 repeat_; - bool shuffle_; - internal::Int32 stack_trace_depth_; - std::string stream_result_to_; - bool throw_on_failure_; -} GTEST_ATTRIBUTE_UNUSED_; - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -GTEST_API_ std::string CodePointToUtf8(UInt32 code_point); - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. -GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded(); - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (e.g., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. -GTEST_API_ bool ShouldShard(const char* total_shards_str, - const char* shard_index_str, - bool in_subprocess_for_death_test); - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error and -// and aborts. -GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); - -// STL container utilities. - -// Returns the number of elements in the given container that satisfy -// the given predicate. -template -inline int CountIf(const Container& c, Predicate predicate) { - // Implemented as an explicit loop since std::count_if() in libCstd on - // Solaris has a non-standard signature. - int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; - } - return count; -} - -// Applies a function/functor to each element in the container. -template -void ForEach(const Container& c, Functor functor) { - std::for_each(c.begin(), c.end(), functor); -} - -// Returns the i-th element of the vector, or default_value if i is not -// in range [0, v.size()). -template -inline E GetElementOr(const std::vector& v, int i, E default_value) { - return (i < 0 || i >= static_cast(v.size())) ? default_value : v[i]; -} - -// Performs an in-place shuffle of a range of the vector's elements. -// 'begin' and 'end' are element indices as an STL-style range; -// i.e. [begin, end) are shuffled, where 'end' == size() means to -// shuffle to the end of the vector. -template -void ShuffleRange(internal::Random* random, int begin, int end, - std::vector* v) { - const int size = static_cast(v->size()); - GTEST_CHECK_(0 <= begin && begin <= size) - << "Invalid shuffle range start " << begin << ": must be in range [0, " - << size << "]."; - GTEST_CHECK_(begin <= end && end <= size) - << "Invalid shuffle range finish " << end << ": must be in range [" - << begin << ", " << size << "]."; - - // Fisher-Yates shuffle, from - // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle - for (int range_width = end - begin; range_width >= 2; range_width--) { - const int last_in_range = begin + range_width - 1; - const int selected = begin + random->Generate(range_width); - std::swap((*v)[selected], (*v)[last_in_range]); - } -} - -// Performs an in-place shuffle of the vector's elements. -template -inline void Shuffle(internal::Random* random, std::vector* v) { - ShuffleRange(random, 0, static_cast(v->size()), v); -} - -// A function for deleting an object. Handy for being used as a -// functor. -template -static void Delete(T* x) { - delete x; -} - -// A predicate that checks the key of a TestProperty against a known key. -// -// TestPropertyKeyIs is copyable. -class TestPropertyKeyIs { - public: - // Constructor. - // - // TestPropertyKeyIs has NO default constructor. - explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} - - // Returns true iff the test name of test property matches on key_. - bool operator()(const TestProperty& test_property) const { - return test_property.key() == key_; - } - - private: - std::string key_; -}; - -// Class UnitTestOptions. -// -// This class contains functions for processing options the user -// specifies when running the tests. It has only static members. -// -// In most cases, the user can specify an option using either an -// environment variable or a command line flag. E.g. you can set the -// test filter using either GTEST_FILTER or --gtest_filter. If both -// the variable and the flag are present, the latter overrides the -// former. -class GTEST_API_ UnitTestOptions { - public: - // Functions for processing the gtest_output flag. - - // Returns the output format, or "" for normal printed output. - static std::string GetOutputFormat(); - - // Returns the absolute path of the requested output file, or the - // default (test_detail.xml in the original working directory) if - // none was explicitly specified. - static std::string GetAbsolutePathToOutputFile(); - - // Functions for processing the gtest_filter flag. - - // Returns true iff the wildcard pattern matches the string. The - // first ':' or '\0' character in pattern marks the end of it. - // - // This recursive algorithm isn't very efficient, but is clear and - // works well enough for matching test names, which are short. - static bool PatternMatchesString(const char *pattern, const char *str); - - // Returns true iff the user-specified filter matches the test case - // name and the test name. - static bool FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name); - -#if GTEST_OS_WINDOWS - // Function for supporting the gtest_catch_exception flag. - - // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the - // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. - // This function is useful as an __except condition. - static int GTestShouldProcessSEH(DWORD exception_code); -#endif // GTEST_OS_WINDOWS - - // Returns true if "name" matches the ':' separated list of glob-style - // filters in "filter". - static bool MatchesFilter(const std::string& name, const char* filter); -}; - -// Returns the current application's name, removing directory path if that -// is present. Used by UnitTestOptions::GetOutputFile. -GTEST_API_ FilePath GetCurrentExecutableName(); - -// The role interface for getting the OS stack trace as a string. -class OsStackTraceGetterInterface { - public: - OsStackTraceGetterInterface() {} - virtual ~OsStackTraceGetterInterface() {} - - // Returns the current OS stack trace as an std::string. Parameters: - // - // max_depth - the maximum number of stack frames to be included - // in the trace. - // skip_count - the number of top frames to be skipped; doesn't count - // against max_depth. - virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; - - // UponLeavingGTest() should be called immediately before Google Test calls - // user code. It saves some information about the current stack that - // CurrentStackTrace() will use to find and hide Google Test stack frames. - virtual void UponLeavingGTest() = 0; - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); -}; - -// A working implementation of the OsStackTraceGetterInterface interface. -class OsStackTraceGetter : public OsStackTraceGetterInterface { - public: - OsStackTraceGetter() : caller_frame_(NULL) {} - - virtual string CurrentStackTrace(int max_depth, int skip_count) - GTEST_LOCK_EXCLUDED_(mutex_); - - virtual void UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_); - - // This string is inserted in place of stack frames that are part of - // Google Test's implementation. - static const char* const kElidedFramesMarker; - - private: - Mutex mutex_; // protects all internal state - - // We save the stack frame below the frame that calls user code. - // We do this because the address of the frame immediately below - // the user code changes between the call to UponLeavingGTest() - // and any calls to CurrentStackTrace() from within the user code. - void* caller_frame_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); -}; - -// Information about a Google Test trace point. -struct TraceInfo { - const char* file; - int line; - std::string message; -}; - -// This is the default global test part result reporter used in UnitTestImpl. -// This class should only be used by UnitTestImpl. -class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. Reports the test part - // result in the current test. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); -}; - -// This is the default per thread test part result reporter used in -// UnitTestImpl. This class should only be used by UnitTestImpl. -class DefaultPerThreadTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. The implementation just - // delegates to the current global test part result reporter of *unit_test_. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); -}; - -// The private implementation of the UnitTest class. We don't protect -// the methods under a mutex, as this class is not accessible by a -// user and the UnitTest class that delegates work to this class does -// proper locking. -class GTEST_API_ UnitTestImpl { - public: - explicit UnitTestImpl(UnitTest* parent); - virtual ~UnitTestImpl(); - - // There are two different ways to register your own TestPartResultReporter. - // You can register your own repoter to listen either only for test results - // from the current thread or for results from all threads. - // By default, each per-thread test result repoter just passes a new - // TestPartResult to the global test result reporter, which registers the - // test part result for the currently running test. - - // Returns the global test part result reporter. - TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); - - // Sets the global test part result reporter. - void SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter); - - // Returns the test part result reporter for the current thread. - TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); - - // Sets the test part result reporter for the current thread. - void SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter); - - // Gets the number of successful test cases. - int successful_test_case_count() const; - - // Gets the number of failed test cases. - int failed_test_case_count() const; - - // Gets the number of all test cases. - int total_test_case_count() const; - - // Gets the number of all test cases that contain at least one test - // that should run. - int test_case_to_run_count() const; - - // Gets the number of successful tests. - int successful_test_count() const; - - // Gets the number of failed tests. - int failed_test_count() const; - - // Gets the number of disabled tests that will be reported in the XML report. - int reportable_disabled_test_count() const; - - // Gets the number of disabled tests. - int disabled_test_count() const; - - // Gets the number of tests to be printed in the XML report. - int reportable_test_count() const; - - // Gets the number of all tests. - int total_test_count() const; - - // Gets the number of tests that should run. - int test_to_run_count() const; - - // Gets the time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp() const { return start_timestamp_; } - - // Gets the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns true iff the unit test passed (i.e. all test cases passed). - bool Passed() const { return !Failed(); } - - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). - bool Failed() const { - return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[i]; - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i) { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[index]; - } - - // Provides access to the event listener list. - TestEventListeners* listeners() { return &listeners_; } - - // Returns the TestResult for the test that's currently running, or - // the TestResult for the ad hoc test if no test is running. - TestResult* current_test_result(); - - // Returns the TestResult for the ad hoc test. - const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } - - // Sets the OS stack trace getter. - // - // Does nothing if the input and the current OS stack trace getter - // are the same; otherwise, deletes the old getter and makes the - // input the current getter. - void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); - - // Returns the current OS stack trace getter if it is not NULL; - // otherwise, creates an OsStackTraceGetter, makes it the current - // getter, and returns it. - OsStackTraceGetterInterface* os_stack_trace_getter(); - - // Returns the current OS stack trace as an std::string. - // - // The maximum number of stack frames to be included is specified by - // the gtest_stack_trace_depth flag. The skip_count parameter - // specifies the number of top frames to be skipped, which doesn't - // count against the number of frames to be included. - // - // For example, if Foo() calls Bar(), which in turn calls - // CurrentOsStackTraceExceptTop(1), Foo() will be included in the - // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_; - - // Finds and returns a TestCase with the given name. If one doesn't - // exist, creates one and returns it. - // - // Arguments: - // - // test_case_name: name of the test case - // type_param: the name of the test's type parameter, or NULL if - // this is not a typed or a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase* GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); - - // Adds a TestInfo to the unit test. - // - // Arguments: - // - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - // test_info: the TestInfo object - void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, - TestInfo* test_info) { - // In order to support thread-safe death tests, we need to - // remember the original working directory when the test program - // was first invoked. We cannot do this in RUN_ALL_TESTS(), as - // the user may have changed the current directory before calling - // RUN_ALL_TESTS(). Therefore we capture the current directory in - // AddTestInfo(), which is called to register a TEST or TEST_F - // before main() is reached. - if (original_working_dir_.IsEmpty()) { - original_working_dir_.Set(FilePath::GetCurrentDir()); - GTEST_CHECK_(!original_working_dir_.IsEmpty()) - << "Failed to get the current working directory."; - } - - GetTestCase(test_info->test_case_name(), - test_info->type_param(), - set_up_tc, - tear_down_tc)->AddTestInfo(test_info); - } - -#if GTEST_HAS_PARAM_TEST - // Returns ParameterizedTestCaseRegistry object used to keep track of - // value-parameterized tests and instantiate and register them. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { - return parameterized_test_registry_; - } -#endif // GTEST_HAS_PARAM_TEST - - // Sets the TestCase object for the test that's currently running. - void set_current_test_case(TestCase* a_current_test_case) { - current_test_case_ = a_current_test_case; - } - - // Sets the TestInfo object for the test that's currently running. If - // current_test_info is NULL, the assertion results will be stored in - // ad_hoc_test_result_. - void set_current_test_info(TestInfo* a_current_test_info) { - current_test_info_ = a_current_test_info; - } - - // Registers all parameterized tests defined using TEST_P and - // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter - // combination. This method can be called more then once; it has guards - // protecting from registering the tests more then once. If - // value-parameterized tests are disabled, RegisterParameterizedTests is - // present but does nothing. - void RegisterParameterizedTests(); - - // Runs all tests in this UnitTest object, prints the result, and - // returns true if all tests are successful. If any exception is - // thrown during a test, this test is considered to be failed, but - // the rest of the tests will still be run. - bool RunAllTests(); - - // Clears the results of all tests, except the ad hoc tests. - void ClearNonAdHocTestResult() { - ForEach(test_cases_, TestCase::ClearTestCaseResult); - } - - // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } - - // Adds a TestProperty to the current TestResult object when invoked in a - // context of a test or a test case, or to the global property set. If the - // result already contains a property with the same key, the value will be - // updated. - void RecordProperty(const TestProperty& test_property); - - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; - - // Matches the full name of each test against the user-specified - // filter to decide whether the test should run, then records the - // result in each TestCase and TestInfo object. - // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests - // based on sharding variables in the environment. - // Returns the number of tests that should run. - int FilterTests(ReactionToSharding shard_tests); - - // Prints the names of the tests matching the user-specified filter flag. - void ListTestsMatchingFilter(); - - const TestCase* current_test_case() const { return current_test_case_; } - TestInfo* current_test_info() { return current_test_info_; } - const TestInfo* current_test_info() const { return current_test_info_; } - - // Returns the vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector& environments() { return environments_; } - - // Getters for the per-thread Google Test trace stack. - std::vector& gtest_trace_stack() { - return *(gtest_trace_stack_.pointer()); - } - const std::vector& gtest_trace_stack() const { - return gtest_trace_stack_.get(); - } - -#if GTEST_HAS_DEATH_TEST - void InitDeathTestSubprocessControlInfo() { - internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); - } - // Returns a pointer to the parsed --gtest_internal_run_death_test - // flag, or NULL if that flag was not specified. - // This information is useful only in a death test child process. - // Must not be called before a call to InitGoogleTest. - const InternalRunDeathTestFlag* internal_run_death_test_flag() const { - return internal_run_death_test_flag_.get(); - } - - // Returns a pointer to the current death test factory. - internal::DeathTestFactory* death_test_factory() { - return death_test_factory_.get(); - } - - void SuppressTestEventsIfInSubprocess(); - - friend class ReplaceDeathTestFactory; -#endif // GTEST_HAS_DEATH_TEST - - // Initializes the event listener performing XML output as specified by - // UnitTestOptions. Must not be called before InitGoogleTest. - void ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Initializes the event listener for streaming test results to a socket. - // Must not be called before InitGoogleTest. - void ConfigureStreamingOutput(); -#endif - - // Performs initialization dependent upon flag values obtained in - // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to - // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest - // this function is also called from RunAllTests. Since this function can be - // called more than once, it has to be idempotent. - void PostFlagParsingInit(); - - // Gets the random seed used at the start of the current test iteration. - int random_seed() const { return random_seed_; } - - // Gets the random number generator. - internal::Random* random() { return &random_; } - - // Shuffles all test cases, and the tests within each test case, - // making sure that death tests are still run first. - void ShuffleTests(); - - // Restores the test cases and tests to their order before the first shuffle. - void UnshuffleTests(); - - // Returns the value of GTEST_FLAG(catch_exceptions) at the moment - // UnitTest::Run() starts. - bool catch_exceptions() const { return catch_exceptions_; } - - private: - friend class ::testing::UnitTest; - - // Used by UnitTest::Run() to capture the state of - // GTEST_FLAG(catch_exceptions) at the moment it starts. - void set_catch_exceptions(bool value) { catch_exceptions_ = value; } - - // The UnitTest object that owns this implementation object. - UnitTest* const parent_; - - // The working directory when the first TEST() or TEST_F() was - // executed. - internal::FilePath original_working_dir_; - - // The default test part result reporters. - DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; - DefaultPerThreadTestPartResultReporter - default_per_thread_test_part_result_reporter_; - - // Points to (but doesn't own) the global test part result reporter. - TestPartResultReporterInterface* global_test_part_result_repoter_; - - // Protects read and write access to global_test_part_result_reporter_. - internal::Mutex global_test_part_result_reporter_mutex_; - - // Points to (but doesn't own) the per-thread test part result reporter. - internal::ThreadLocal - per_thread_test_part_result_reporter_; - - // The vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector environments_; - - // The vector of TestCases in their original order. It owns the - // elements in the vector. - std::vector test_cases_; - - // Provides a level of indirection for the test case list to allow - // easy shuffling and restoring the test case order. The i-th - // element of this vector is the index of the i-th test case in the - // shuffled order. - std::vector test_case_indices_; - -#if GTEST_HAS_PARAM_TEST - // ParameterizedTestRegistry object used to register value-parameterized - // tests. - internal::ParameterizedTestCaseRegistry parameterized_test_registry_; - - // Indicates whether RegisterParameterizedTests() has been called already. - bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST - - // Index of the last death test case registered. Initially -1. - int last_death_test_case_; - - // This points to the TestCase for the currently running test. It - // changes as Google Test goes through one test case after another. - // When no test is running, this is set to NULL and Google Test - // stores assertion results in ad_hoc_test_result_. Initially NULL. - TestCase* current_test_case_; - - // This points to the TestInfo for the currently running test. It - // changes as Google Test goes through one test after another. When - // no test is running, this is set to NULL and Google Test stores - // assertion results in ad_hoc_test_result_. Initially NULL. - TestInfo* current_test_info_; - - // Normally, a user only writes assertions inside a TEST or TEST_F, - // or inside a function called by a TEST or TEST_F. Since Google - // Test keeps track of which test is current running, it can - // associate such an assertion with the test it belongs to. - // - // If an assertion is encountered when no TEST or TEST_F is running, - // Google Test attributes the assertion result to an imaginary "ad hoc" - // test, and records the result in ad_hoc_test_result_. - TestResult ad_hoc_test_result_; - - // The list of event listeners that can be used to track events inside - // Google Test. - TestEventListeners listeners_; - - // The OS stack trace getter. Will be deleted when the UnitTest - // object is destructed. By default, an OsStackTraceGetter is used, - // but the user can set this field to use a custom getter if that is - // desired. - OsStackTraceGetterInterface* os_stack_trace_getter_; - - // True iff PostFlagParsingInit() has been called. - bool post_flag_parse_init_performed_; - - // The random number seed used at the beginning of the test run. - int random_seed_; - - // Our random number generator. - internal::Random random_; - - // The time of the test program start, in ms from the start of the - // UNIX epoch. - TimeInMillis start_timestamp_; - - // How long the test took to run, in milliseconds. - TimeInMillis elapsed_time_; - -#if GTEST_HAS_DEATH_TEST - // The decomposed components of the gtest_internal_run_death_test flag, - // parsed when RUN_ALL_TESTS is called. - internal::scoped_ptr internal_run_death_test_flag_; - internal::scoped_ptr death_test_factory_; -#endif // GTEST_HAS_DEATH_TEST - - // A per-thread stack of traces created by the SCOPED_TRACE() macro. - internal::ThreadLocal > gtest_trace_stack_; - - // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() - // starts. - bool catch_exceptions_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); -}; // class UnitTestImpl - -// Convenience function for accessing the global UnitTest -// implementation object. -inline UnitTestImpl* GetUnitTestImpl() { - return UnitTest::GetInstance()->impl(); -} - -#if GTEST_USES_SIMPLE_RE - -// Internal helper functions for implementing the simple regular -// expression matcher. -GTEST_API_ bool IsInSet(char ch, const char* str); -GTEST_API_ bool IsAsciiDigit(char ch); -GTEST_API_ bool IsAsciiPunct(char ch); -GTEST_API_ bool IsRepeat(char ch); -GTEST_API_ bool IsAsciiWhiteSpace(char ch); -GTEST_API_ bool IsAsciiWordChar(char ch); -GTEST_API_ bool IsValidEscape(char ch); -GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); -GTEST_API_ bool ValidateRegex(const char* regex); -GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); -GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); - -#endif // GTEST_USES_SIMPLE_RE - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); - -#if GTEST_HAS_DEATH_TEST - -// Returns the message describing the last system error, regardless of the -// platform. -GTEST_API_ std::string GetLastErrnoDescription(); - -# if GTEST_OS_WINDOWS -// Provides leak-safe Windows kernel handle ownership. -class AutoHandle { - public: - AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} - explicit AutoHandle(HANDLE handle) : handle_(handle) {} - - ~AutoHandle() { Reset(); } - - HANDLE Get() const { return handle_; } - void Reset() { Reset(INVALID_HANDLE_VALUE); } - void Reset(HANDLE handle) { - if (handle != handle_) { - if (handle_ != INVALID_HANDLE_VALUE) - ::CloseHandle(handle_); - handle_ = handle; - } - } - - private: - HANDLE handle_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); -}; -# endif // GTEST_OS_WINDOWS - -// Attempts to parse a string into a positive integer pointed to by the -// number parameter. Returns true if that is possible. -// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use -// it here. -template -bool ParseNaturalNumber(const ::std::string& str, Integer* number) { - // Fail fast if the given string does not begin with a digit; - // this bypasses strtoXXX's "optional leading whitespace and plus - // or minus sign" semantics, which are undesirable here. - if (str.empty() || !IsDigit(str[0])) { - return false; - } - errno = 0; - - char* end; - // BiggestConvertible is the largest integer type that system-provided - // string-to-number conversion routines can return. - -# if GTEST_OS_WINDOWS && !defined(__GNUC__) - - // MSVC and C++ Builder define __int64 instead of the standard long long. - typedef unsigned __int64 BiggestConvertible; - const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); - -# else - - typedef unsigned long long BiggestConvertible; // NOLINT - const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); - -# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) - - const bool parse_success = *end == '\0' && errno == 0; - - // TODO(vladl@google.com): Convert this to compile time assertion when it is - // available. - GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); - - const Integer result = static_cast(parsed); - if (parse_success && static_cast(result) == parsed) { - *number = result; - return true; - } - return false; -} -#endif // GTEST_HAS_DEATH_TEST - -// TestResult contains some private methods that should be hidden from -// Google Test user but are required for testing. This class allow our tests -// to access them. -// -// This class is supplied only for the purpose of testing Google Test's own -// constructs. Do not use it in user tests, either directly or indirectly. -class TestResultAccessor { - public: - static void RecordProperty(TestResult* test_result, - const std::string& xml_element, - const TestProperty& property) { - test_result->RecordProperty(xml_element, property); - } - - static void ClearTestPartResults(TestResult* test_result) { - test_result->ClearTestPartResults(); - } - - static const std::vector& test_part_results( - const TestResult& test_result) { - return test_result.test_part_results(); - } -}; - -#if GTEST_CAN_STREAM_RESULTS_ - -// Streams test results to the given port on the given host machine. -class StreamingListener : public EmptyTestEventListener { - public: - // Abstract base class for writing strings to a socket. - class AbstractSocketWriter { - public: - virtual ~AbstractSocketWriter() {} - - // Sends a string to the socket. - virtual void Send(const string& message) = 0; - - // Closes the socket. - virtual void CloseConnection() {} - - // Sends a string and a newline to the socket. - void SendLn(const string& message) { - Send(message + "\n"); - } - }; - - // Concrete class for actually writing strings to a socket. - class SocketWriter : public AbstractSocketWriter { - public: - SocketWriter(const string& host, const string& port) - : sockfd_(-1), host_name_(host), port_num_(port) { - MakeConnection(); - } - - virtual ~SocketWriter() { - if (sockfd_ != -1) - CloseConnection(); - } - - // Sends a string to the socket. - virtual void Send(const string& message) { - GTEST_CHECK_(sockfd_ != -1) - << "Send() can be called only when there is a connection."; - - const int len = static_cast(message.length()); - if (write(sockfd_, message.c_str(), len) != len) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; - } - } - - private: - // Creates a client socket and connects to the server. - void MakeConnection(); - - // Closes the socket. - void CloseConnection() { - GTEST_CHECK_(sockfd_ != -1) - << "CloseConnection() can be called only when there is a connection."; - - close(sockfd_); - sockfd_ = -1; - } - - int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); - }; // class SocketWriter - - // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); - - StreamingListener(const string& host, const string& port) - : socket_writer_(new SocketWriter(host, port)) { Start(); } - - explicit StreamingListener(AbstractSocketWriter* socket_writer) - : socket_writer_(socket_writer) { Start(); } - - void OnTestProgramStart(const UnitTest& /* unit_test */) { - SendLn("event=TestProgramStart"); - } - - void OnTestProgramEnd(const UnitTest& unit_test) { - // Note that Google Test current only report elapsed time for each - // test iteration, not for the entire test program. - SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); - - // Notify the streaming server to stop. - socket_writer_->CloseConnection(); - } - - void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { - SendLn("event=TestIterationStart&iteration=" + - StreamableToString(iteration)); - } - - void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { - SendLn("event=TestIterationEnd&passed=" + - FormatBool(unit_test.Passed()) + "&elapsed_time=" + - StreamableToString(unit_test.elapsed_time()) + "ms"); - } - - void OnTestCaseStart(const TestCase& test_case) { - SendLn(std::string("event=TestCaseStart&name=") + test_case.name()); - } - - void OnTestCaseEnd(const TestCase& test_case) { - SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) - + "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) - + "ms"); - } - - void OnTestStart(const TestInfo& test_info) { - SendLn(std::string("event=TestStart&name=") + test_info.name()); - } - - void OnTestEnd(const TestInfo& test_info) { - SendLn("event=TestEnd&passed=" + - FormatBool((test_info.result())->Passed()) + - "&elapsed_time=" + - StreamableToString((test_info.result())->elapsed_time()) + "ms"); - } - - void OnTestPartResult(const TestPartResult& test_part_result) { - const char* file_name = test_part_result.file_name(); - if (file_name == NULL) - file_name = ""; - SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + - "&line=" + StreamableToString(test_part_result.line_number()) + - "&message=" + UrlEncode(test_part_result.message())); - } - - private: - // Sends the given message and a newline to the socket. - void SendLn(const string& message) { socket_writer_->SendLn(message); } - - // Called at the start of streaming to notify the receiver what - // protocol we are using. - void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } - - string FormatBool(bool value) { return value ? "1" : "0"; } - - const scoped_ptr socket_writer_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); -}; // class StreamingListener - -#endif // GTEST_CAN_STREAM_RESULTS_ - -} // namespace internal -} // namespace testing - -#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ -#undef GTEST_IMPLEMENTATION_ - -#if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf -#endif // GTEST_OS_WINDOWS - -namespace testing { - -using internal::CountIf; -using internal::ForEach; -using internal::GetElementOr; -using internal::Shuffle; - -// Constants. - -// A test whose test case name or test name matches this filter is -// disabled and not run. -static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; - -// A test case whose name matches this filter is considered a death -// test case and will be run before test cases whose name doesn't -// match this filter. -static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; - -// A test filter that matches everything. -static const char kUniversalFilter[] = "*"; - -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; - -// The environment variable name for the test shard index. -static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; -// The environment variable name for the total number of test shards. -static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; -// The environment variable name for the test shard status file. -static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; - -namespace internal { - -// The text used in failure messages to indicate the start of the -// stack trace. -const char kStackTraceMarker[] = "\nStack trace:\n"; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -bool g_help_flag = false; - -} // namespace internal - -static const char* GetDefaultFilter() { - return kUniversalFilter; -} - -GTEST_DEFINE_bool_( - also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), - "Run disabled tests too, in addition to the tests normally being run."); - -GTEST_DEFINE_bool_( - break_on_failure, - internal::BoolFromGTestEnv("break_on_failure", false), - "True iff a failed assertion should be a debugger break-point."); - -GTEST_DEFINE_bool_( - catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), - "True iff " GTEST_NAME_ - " should catch exceptions and treat them as test failures."); - -GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), - "Whether to use colors in the output. Valid values: yes, no, " - "and auto. 'auto' means to use colors if the output is " - "being sent to a terminal and the TERM environment variable " - "is set to a terminal type that supports colors."); - -GTEST_DEFINE_string_( - filter, - internal::StringFromGTestEnv("filter", GetDefaultFilter()), - "A colon-separated list of glob (not regex) patterns " - "for filtering the tests to run, optionally followed by a " - "'-' and a : separated list of negative patterns (tests to " - "exclude). A test is run if it matches one of the positive " - "patterns and does not match any of the negative patterns."); - -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); - -GTEST_DEFINE_string_( - output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " - "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " - "If a directory is specified, output files will be created " - "within that directory, with file-names based on the test " - "executable's name and, if necessary, made unique by adding " - "digits."); - -GTEST_DEFINE_bool_( - print_time, - internal::BoolFromGTestEnv("print_time", true), - "True iff " GTEST_NAME_ - " should display elapsed time in text output."); - -GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), - "Random number seed to use when shuffling test orders. Must be in range " - "[1, 99999], or 0 to use a seed based on the current time."); - -GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), - "How many times to repeat each test. Specify a negative number " - "for repeating forever. Useful for shaking out flaky tests."); - -GTEST_DEFINE_bool_( - show_internal_stack_frames, false, - "True iff " GTEST_NAME_ " should include internal stack frames when " - "printing test failure stack traces."); - -GTEST_DEFINE_bool_( - shuffle, - internal::BoolFromGTestEnv("shuffle", false), - "True iff " GTEST_NAME_ - " should randomize tests' order on every run."); - -GTEST_DEFINE_int32_( - stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), - "The maximum number of stack frames to print when an " - "assertion fails. The valid range is 0 through 100, inclusive."); - -GTEST_DEFINE_string_( - stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), - "This flag specifies the host name and the port number on which to stream " - "test results. Example: \"localhost:555\". The flag is effective only on " - "Linux."); - -GTEST_DEFINE_bool_( - throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), - "When this flag is specified, a failed assertion will throw an exception " - "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); - -namespace internal { - -// Generates a random number from [0, range), using a Linear -// Congruential Generator (LCG). Crashes if 'range' is 0 or greater -// than kMaxRange. -UInt32 Random::Generate(UInt32 range) { - // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; - - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; - GTEST_CHECK_(range <= kMaxRange) - << "Generation of a number in [0, " << range << ") was requested, " - << "but this can only generate numbers in [0, " << kMaxRange << ")."; - - // Converting via modulus introduces a bit of downward bias, but - // it's simple, and a linear congruential generator isn't too good - // to begin with. - return state_ % range; -} - -// GTestIsInitialized() returns true iff the user has initialized -// Google Test. Useful for catching the user mistake of not initializing -// Google Test before calling RUN_ALL_TESTS(). -// -// A user must call testing::InitGoogleTest() to initialize Google -// Test. g_init_gtest_count is set to the number of times -// InitGoogleTest() has been called. We don't protect this variable -// under a mutex as it is only accessed in the main thread. -GTEST_API_ int g_init_gtest_count = 0; -static bool GTestIsInitialized() { return g_init_gtest_count != 0; } - -// Iterates over a vector of TestCases, keeping a running sum of the -// results of calling a given int-returning method on each. -// Returns the sum. -static int SumOverTestCaseList(const std::vector& case_list, - int (TestCase::*method)() const) { - int sum = 0; - for (size_t i = 0; i < case_list.size(); i++) { - sum += (case_list[i]->*method)(); - } - return sum; -} - -// Returns true iff the test case passed. -static bool TestCasePassed(const TestCase* test_case) { - return test_case->should_run() && test_case->Passed(); -} - -// Returns true iff the test case failed. -static bool TestCaseFailed(const TestCase* test_case) { - return test_case->should_run() && test_case->Failed(); -} - -// Returns true iff test_case contains at least one test that should -// run. -static bool ShouldRunTestCase(const TestCase* test_case) { - return test_case->should_run(); -} - -// AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} - -AssertHelper::~AssertHelper() { - delete data_; -} - -// Message assignment, for assertion streaming support. -void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT -} - -// Mutex for linked pointers. -GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// Application pathname gotten in InitGoogleTest. -std::string g_executable_path; - -// Returns the current application's name, removing directory path if that -// is present. -FilePath GetCurrentExecutableName() { - FilePath result; - -#if GTEST_OS_WINDOWS - result.Set(FilePath(g_executable_path).RemoveExtension("exe")); -#else - result.Set(FilePath(g_executable_path)); -#endif // GTEST_OS_WINDOWS - - return result.RemoveDirectoryName(); -} - -// Functions for processing the gtest_output flag. - -// Returns the output format, or "" for normal printed output. -std::string UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return std::string(""); - - const char* const colon = strchr(gtest_output_flag, ':'); - return (colon == NULL) ? - std::string(gtest_output_flag) : - std::string(gtest_output_flag, colon - gtest_output_flag); -} - -// Returns the name of the requested output file, or the default if none -// was explicitly specified. -std::string UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return ""; - - const char* const colon = strchr(gtest_output_flag, ':'); - if (colon == NULL) - return internal::FilePath::ConcatPaths( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).string(); - - internal::FilePath output_name(colon + 1); - if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute - // path (as its meaning depends on the current drive), yet the - // following logic for turning it into an absolute path is wrong. - // Fix it. - output_name = internal::FilePath::ConcatPaths( - internal::FilePath(UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(colon + 1)); - - if (!output_name.IsDirectory()) - return output_name.string(); - - internal::FilePath result(internal::FilePath::GenerateUniqueFileName( - output_name, internal::GetCurrentExecutableName(), - GetOutputFormat().c_str())); - return result.string(); -} - -// Returns true iff the wildcard pattern matches the string. The -// first ':' or '\0' character in pattern marks the end of it. -// -// This recursive algorithm isn't very efficient, but is clear and -// works well enough for matching test names, which are short. -bool UnitTestOptions::PatternMatchesString(const char *pattern, - const char *str) { - switch (*pattern) { - case '\0': - case ':': // Either ':' or '\0' marks the end of the pattern. - return *str == '\0'; - case '?': // Matches any single character. - return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); - case '*': // Matches any string (possibly empty) of characters. - return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || - PatternMatchesString(pattern + 1, str); - default: // Non-special character. Matches itself. - return *pattern == *str && - PatternMatchesString(pattern + 1, str + 1); - } -} - -bool UnitTestOptions::MatchesFilter( - const std::string& name, const char* filter) { - const char *cur_pattern = filter; - for (;;) { - if (PatternMatchesString(cur_pattern, name.c_str())) { - return true; - } - - // Finds the next pattern in the filter. - cur_pattern = strchr(cur_pattern, ':'); - - // Returns if no more pattern can be found. - if (cur_pattern == NULL) { - return false; - } - - // Skips the pattern separater (the ':' character). - cur_pattern++; - } -} - -// Returns true iff the user-specified filter matches the test case -// name and the test name. -bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name, - const std::string &test_name) { - const std::string& full_name = test_case_name + "." + test_name.c_str(); - - // Split --gtest_filter at '-', if there is one, to separate into - // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - std::string positive; - std::string negative; - if (dash == NULL) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = ""; - } else { - positive = std::string(p, dash); // Everything up to the dash - negative = std::string(dash + 1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); -} - -#if GTEST_HAS_SEH -// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the -// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. -// This function is useful as an __except condition. -int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { - // Google Test should handle a SEH exception if: - // 1. the user wants it to, AND - // 2. this is not a breakpoint exception, AND - // 3. this is not a C++ exception (VC++ implements them via SEH, - // apparently). - // - // SEH exception code for C++ exceptions. - // (see http://support.microsoft.com/kb/185294 for more information). - const DWORD kCxxExceptionCode = 0xe06d7363; - - bool should_handle = true; - - if (!GTEST_FLAG(catch_exceptions)) - should_handle = false; - else if (exception_code == EXCEPTION_BREAKPOINT) - should_handle = false; - else if (exception_code == kCxxExceptionCode) - should_handle = false; - - return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; -} -#endif // GTEST_HAS_SEH - -} // namespace internal - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. Intercepts only failures from the current thread. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { - Init(); -} - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { - Init(); -} - -void ScopedFakeTestPartResultReporter::Init() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - old_reporter_ = impl->GetGlobalTestPartResultReporter(); - impl->SetGlobalTestPartResultReporter(this); - } else { - old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); - impl->SetTestPartResultReporterForCurrentThread(this); - } -} - -// The d'tor restores the test part result reporter used by Google Test -// before. -ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - impl->SetGlobalTestPartResultReporter(old_reporter_); - } else { - impl->SetTestPartResultReporterForCurrentThread(old_reporter_); - } -} - -// Increments the test part result count and remembers the result. -// This method is from the TestPartResultReporterInterface interface. -void ScopedFakeTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - result_->Append(result); -} - -namespace internal { - -// Returns the type ID of ::testing::Test. We should always call this -// instead of GetTypeId< ::testing::Test>() to get the type ID of -// testing::Test. This is to work around a suspected linker bug when -// using Google Test as a framework on Mac OS X. The bug causes -// GetTypeId< ::testing::Test>() to return different values depending -// on whether the call is from the Google Test framework itself or -// from user test code. GetTestTypeId() is guaranteed to always -// return the same value, as it always calls GetTypeId<>() from the -// gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); - -// This predicate-formatter checks that 'results' contains a test part -// failure of the given type and that the failure message contains the -// given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { - const std::string expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); - Message msg; - if (results.size() != 1) { - msg << "Expected: " << expected << "\n" - << " Actual: " << results.size() << " failures"; - for (int i = 0; i < results.size(); i++) { - msg << "\n" << results.GetTestPartResult(i); - } - return AssertionFailure() << msg; - } - - const TestPartResult& r = results.GetTestPartResult(0); - if (r.type() != type) { - return AssertionFailure() << "Expected: " << expected << "\n" - << " Actual:\n" - << r; - } - - if (strstr(r.message(), substr.c_str()) == NULL) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; - } - - return AssertionSuccess(); -} - -// The constructor of SingleFailureChecker remembers where to look up -// test part results, what type of failure we expect, and what -// substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} - -// The destructor of SingleFailureChecker verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. -SingleFailureChecker::~SingleFailureChecker() { - EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); -} - -DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultGlobalTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->current_test_result()->AddTestPartResult(result); - unit_test_->listeners()->repeater()->OnTestPartResult(result); -} - -DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); -} - -// Returns the global test part result reporter. -TestPartResultReporterInterface* -UnitTestImpl::GetGlobalTestPartResultReporter() { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - return global_test_part_result_repoter_; -} - -// Sets the global test part result reporter. -void UnitTestImpl::SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter) { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - global_test_part_result_repoter_ = reporter; -} - -// Returns the test part result reporter for the current thread. -TestPartResultReporterInterface* -UnitTestImpl::GetTestPartResultReporterForCurrentThread() { - return per_thread_test_part_result_reporter_.get(); -} - -// Sets the test part result reporter for the current thread. -void UnitTestImpl::SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter) { - per_thread_test_part_result_reporter_.set(reporter); -} - -// Gets the number of successful test cases. -int UnitTestImpl::successful_test_case_count() const { - return CountIf(test_cases_, TestCasePassed); -} - -// Gets the number of failed test cases. -int UnitTestImpl::failed_test_case_count() const { - return CountIf(test_cases_, TestCaseFailed); -} - -// Gets the number of all test cases. -int UnitTestImpl::total_test_case_count() const { - return static_cast(test_cases_.size()); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTestImpl::test_case_to_run_count() const { - return CountIf(test_cases_, ShouldRunTestCase); -} - -// Gets the number of successful tests. -int UnitTestImpl::successful_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); -} - -// Gets the number of failed tests. -int UnitTestImpl::failed_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTestImpl::reportable_disabled_test_count() const { - return SumOverTestCaseList(test_cases_, - &TestCase::reportable_disabled_test_count); -} - -// Gets the number of disabled tests. -int UnitTestImpl::disabled_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTestImpl::reportable_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count); -} - -// Gets the number of all tests. -int UnitTestImpl::total_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); -} - -// Gets the number of tests that should run. -int UnitTestImpl::test_to_run_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// CurrentOsStackTraceExceptTop(1), Foo() will be included in the -// trace but Bar() and CurrentOsStackTraceExceptTop() won't. -std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { - (void)skip_count; - return ""; -} - -// Returns the current time in milliseconds. -TimeInMillis GetTimeInMillis() { -#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) - // Difference between 1970-01-01 and 1601-01-01 in milliseconds. - // http://analogous.blogspot.com/2005/04/epoch.html - const TimeInMillis kJavaEpochToWinFileTimeDelta = - static_cast(116444736UL) * 100000UL; - const DWORD kTenthMicrosInMilliSecond = 10000; - - SYSTEMTIME now_systime; - FILETIME now_filetime; - ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use - // GetSystemTimeAsFileTime()? - GetSystemTime(&now_systime); - if (SystemTimeToFileTime(&now_systime, &now_filetime)) { - now_int64.LowPart = now_filetime.dwLowDateTime; - now_int64.HighPart = now_filetime.dwHighDateTime; - now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - - kJavaEpochToWinFileTimeDelta; - return now_int64.QuadPart; - } - return 0; -#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ - __timeb64 now; - -# ifdef _MSC_VER - - // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 - // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use - // SystemTimeToFileTime() -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996. - _ftime64(&now); -# pragma warning(pop) // Restores the warning state. -# else - - _ftime64(&now); - -# endif // _MSC_VER - - return static_cast(now.time) * 1000 + now.millitm; -#elif GTEST_HAS_GETTIMEOFDAY_ - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; -#else -# error "Don't know how to get the current time on your system." -#endif -} - -// Utilities - -// class String. - -#if GTEST_OS_WINDOWS_MOBILE -// Creates a UTF-16 wide string from the given ANSI string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the wide string, or NULL if the -// input is NULL. -LPCWSTR String::AnsiToUtf16(const char* ansi) { - if (!ansi) return NULL; - const int length = strlen(ansi); - const int unicode_length = - MultiByteToWideChar(CP_ACP, 0, ansi, length, - NULL, 0); - WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); - unicode[unicode_length] = 0; - return unicode; -} - -// Creates an ANSI string from the given wide string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the ANSI string, or NULL if the -// input is NULL. -const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { - if (!utf16_str) return NULL; - const int ansi_length = - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - NULL, 0, NULL, NULL); - char* ansi = new char[ansi_length + 1]; - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - ansi, ansi_length, NULL, NULL); - ansi[ansi_length] = 0; - return ansi; -} - -#endif // GTEST_OS_WINDOWS_MOBILE - -// Compares two C strings. Returns true iff they have the same content. -// -// Unlike strcmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { - if ( lhs == NULL ) return rhs == NULL; - - if ( rhs == NULL ) return false; - - return strcmp(lhs, rhs) == 0; -} - -#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -// Converts an array of wide chars to a narrow string using the UTF-8 -// encoding, and streams the result to the given Message object. -static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, - Message* msg) { - for (size_t i = 0; i != length; ) { // NOLINT - if (wstr[i] != L'\0') { - *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; - } else { - *msg << '\0'; - i++; - } - } -} - -#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -} // namespace internal - -// Constructs an empty Message. -// We allocate the stringstream separately because otherwise each use of -// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's -// stack frame leading to huge stack frames in some cases; gcc does not reuse -// the stack space. -Message::Message() : ss_(new ::std::stringstream) { - // By default, we want there to be enough precision when printing - // a double to a Message. - *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); -} - -// These two overloads allow streaming a wide C string to a Message -// using the UTF-8 encoding. -Message& Message::operator <<(const wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} -Message& Message::operator <<(wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); -} - -#if GTEST_HAS_STD_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Gets the text streamed to this object so far as an std::string. -// Each '\0' character in the buffer is replaced with "\\0". -std::string Message::GetString() const { - return internal::StringStreamToString(ss_.get()); -} - -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != NULL ? - new ::std::string(*other.message_) : - static_cast< ::std::string*>(NULL)) { -} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != NULL) - negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - -namespace internal { - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const std::string& expected_value, - const std::string& actual_value, - bool ignoring_case) { - Message msg; - msg << "Value of: " << actual_expression; - if (actual_value != actual_expression) { - msg << "\n Actual: " << actual_value; - } - - msg << "\nExpected: " << expected_expression; - if (ignoring_case) { - msg << " (ignoring case)"; - } - if (expected_value != expected_expression) { - msg << "\nWhich is: " << expected_value; - } - - return AssertionFailure() << msg; -} - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -std::string GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { - const char* actual_message = assertion_result.message(); - Message msg; - msg << "Value of: " << expression_text - << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; - msg << "\nExpected: " << expected_predicate_value; - return msg.GetString(); -} - -// Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { - const double diff = fabs(val1 - val2); - if (diff <= abs_error) return AssertionSuccess(); - - // TODO(wan): do not print the value of an expression if it's - // already a literal. - return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; -} - - -// Helper template for implementing FloatLE() and DoubleLE(). -template -AssertionResult FloatingPointLE(const char* expr1, - const char* expr2, - RawType val1, - RawType val2) { - // Returns success if val1 is less than val2, - if (val1 < val2) { - return AssertionSuccess(); - } - - // or if val1 is almost equal to val2. - const FloatingPoint lhs(val1), rhs(val2); - if (lhs.AlmostEquals(rhs)) { - return AssertionSuccess(); - } - - // Note that the above two checks will both fail if either val1 or - // val2 is NaN, as the IEEE floating-point standard requires that - // any predicate involving a NaN must return false. - - ::std::stringstream val1_ss; - val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val1; - - ::std::stringstream val2_ss; - val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << val2; - - return AssertionFailure() - << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" - << " Actual: " << StringStreamToString(&val1_ss) << " vs " - << StringStreamToString(&val2_ss); -} - -} // namespace internal - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2) { - return internal::FloatingPointLE(expr1, expr2, val1, val2); -} - -namespace internal { - -// The helper function for {ASSERT|EXPECT}_EQ with int or enum -// arguments. -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - if (expected == actual) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); -} - -// A macro for implementing the helper functions needed to implement -// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here -// just to avoid copy-and-paste of similar code. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - BiggestInt val1, BiggestInt val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -} - -// Implements the helper function for {ASSERT|EXPECT}_NE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(NE, !=) -// Implements the helper function for {ASSERT|EXPECT}_LE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LE, <=) -// Implements the helper function for {ASSERT|EXPECT}_LT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(LT, < ) -// Implements the helper function for {ASSERT|EXPECT}_GE with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GE, >=) -// Implements the helper function for {ASSERT|EXPECT}_GT with int or -// enum arguments. -GTEST_IMPL_CMP_HELPER_(GT, > ) - -#undef GTEST_IMPL_CMP_HELPER_ - -// The helper function for {ASSERT|EXPECT}_STREQ. -AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// The helper function for {ASSERT|EXPECT}_STRCASEEQ. -AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CaseInsensitiveCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - true); -} - -// The helper function for {ASSERT|EXPECT}_STRNE. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -// The helper function for {ASSERT|EXPECT}_STRCASENE. -AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CaseInsensitiveCStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -} // namespace internal - -namespace { - -// Helper functions for implementing IsSubString() and IsNotSubstring(). - -// This group of overloaded functions return true iff needle is a -// substring of haystack. NULL is considered a substring of itself -// only. - -bool IsSubstringPred(const char* needle, const char* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return strstr(haystack, needle) != NULL; -} - -bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return wcsstr(haystack, needle) != NULL; -} - -// StringType here can be either ::std::string or ::std::wstring. -template -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { - return haystack.find(needle) != StringType::npos; -} - -// This function implements either IsSubstring() or IsNotSubstring(), -// depending on the value of the expected_to_be_substring parameter. -// StringType here can be const char*, const wchar_t*, ::std::string, -// or ::std::wstring. -template -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { - if (IsSubstringPred(needle, haystack) == expected_to_be_substring) - return AssertionSuccess(); - - const bool is_wide_string = sizeof(needle[0]) > 1; - const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; - return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? "" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; -} - -} // namespace - -// IsSubstring() and IsNotSubstring() check whether needle is a -// substring of haystack (NULL is considered a substring of itself -// only), and return an appropriate error message when they fail. - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -#if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} -#endif // GTEST_HAS_STD_WSTRING - -namespace internal { - -#if GTEST_OS_WINDOWS - -namespace { - -// Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, - long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE - - // Windows CE doesn't support FormatMessage. - const char error_text[] = ""; - -# else - - // Looks up the human-readable system message for the HRESULT code - // and since we're not passing any params to FormatMessage, we don't - // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; - const DWORD kBufSize = 4096; - // Gets the system's human readable message string for this HRESULT. - char error_text[kBufSize] = { '\0' }; - DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system - hr, // the error - 0, // no line width restrictions - error_text, // output buffer - kBufSize, // buf size - NULL); // no arguments for inserts - // Trims tailing white space (FormatMessage leaves a trailing CR-LF) - for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { - error_text[message_length - 1] = '\0'; - } - -# endif // GTEST_OS_WINDOWS_MOBILE - - const std::string error_hex("0x" + String::FormatHexInt(hr)); - return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << " " << error_text << "\n"; -} - -} // namespace - -AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT - if (SUCCEEDED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "succeeds", hr); -} - -AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT - if (FAILED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "fails", hr); -} - -#endif // GTEST_OS_WINDOWS - -// Utility functions for encoding Unicode text (wide strings) in -// UTF-8. - -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 -// like this: -// -// Code-point length Encoding -// 0 - 7 bits 0xxxxxxx -// 8 - 11 bits 110xxxxx 10xxxxxx -// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx -// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - -// The maximum code-point a one-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; - -// The maximum code-point a two-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; - -// The maximum code-point a three-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; - -// The maximum code-point a four-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; - -// Chops off the n lowest bits from a bit pattern. Returns the n -// lowest bits. As a side effect, the original bit pattern will be -// shifted to the right by n bits. -inline UInt32 ChopLowBits(UInt32* bits, int n) { - const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); - *bits >>= n; - return low_bits; -} - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted -// to "(Invalid Unicode 0xXXXXXXXX)". -std::string CodePointToUtf8(UInt32 code_point) { - if (code_point > kMaxCodePoint4) { - return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")"; - } - - char str[5]; // Big enough for the largest valid code point. - if (code_point <= kMaxCodePoint1) { - str[1] = '\0'; - str[0] = static_cast(code_point); // 0xxxxxxx - } else if (code_point <= kMaxCodePoint2) { - str[2] = '\0'; - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xC0 | code_point); // 110xxxxx - } else if (code_point <= kMaxCodePoint3) { - str[3] = '\0'; - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xE0 | code_point); // 1110xxxx - } else { // code_point <= kMaxCodePoint4 - str[4] = '\0'; - str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xF0 | code_point); // 11110xxx - } - return str; -} - -// The following two functions only make sense if the the system -// uses UTF-16 for wide string encoding. All supported systems -// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. - -// Determines if the arguments constitute UTF-16 surrogate pair -// and thus should be combined into a single Unicode code point -// using CreateCodePointFromUtf16SurrogatePair. -inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; -} - -// Creates a Unicode code point from UTF16 surrogate pair. -inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, - wchar_t second) { - const UInt32 mask = (1 << 10) - 1; - return (sizeof(wchar_t) == 2) ? - (((first & mask) << 10) | (second & mask)) + 0x10000 : - // This function should not be called when the condition is - // false, but we provide a sensible default in case it is. - static_cast(first); -} - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. -std::string WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast(wcslen(str)); - - ::std::stringstream stream; - for (int i = 0; i < num_chars; ++i) { - UInt32 unicode_code_point; - - if (str[i] == L'\0') { - break; - } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); - i++; - } else { - unicode_code_point = static_cast(str[i]); - } - - stream << CodePointToUtf8(unicode_code_point); - } - return StringStreamToString(&stream); -} - -// Converts a wide C string to an std::string using the UTF-8 encoding. -// NULL will be converted to "(null)". -std::string String::ShowWideCString(const wchar_t * wide_c_str) { - if (wide_c_str == NULL) return "(null)"; - - return internal::WideStringToUtf8(wide_c_str, -1); -} - -// Compares two wide C strings. Returns true iff they have the same -// content. -// -// Unlike wcscmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - - return wcscmp(lhs, rhs) == 0; -} - -// Helper function for *_STREQ on wide strings. -AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual) { - if (String::WideCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - PrintToString(expected), - PrintToString(actual), - false); -} - -// Helper function for *_STRNE on wide strings. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2) { - if (!String::WideCStringEquals(s1, s2)) { - return AssertionSuccess(); - } - - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << PrintToString(s1) - << " vs " << PrintToString(s2); -} - -// Compares two C strings, ignoring case. Returns true iff they have -// the same content. -// -// Unlike strcasecmp(), this function can handle NULL argument(s). A -// NULL C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { - if (lhs == NULL) - return rhs == NULL; - if (rhs == NULL) - return false; - return posix::StrCaseCmp(lhs, rhs) == 0; -} - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. -bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - -#if GTEST_OS_WINDOWS - return _wcsicmp(lhs, rhs) == 0; -#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID - return wcscasecmp(lhs, rhs) == 0; -#else - // Android, Mac OS X and Cygwin don't define wcscasecmp. - // Other unknown OSes may not define it either. - wint_t left, right; - do { - left = towlower(*lhs++); - right = towlower(*rhs++); - } while (left && left == right); - return left == right; -#endif // OS selector -} - -// Returns true iff str ends with the given suffix, ignoring case. -// Any string is considered to end with an empty suffix. -bool String::EndsWithCaseInsensitive( - const std::string& str, const std::string& suffix) { - const size_t str_len = str.length(); - const size_t suffix_len = suffix.length(); - return (str_len >= suffix_len) && - CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, - suffix.c_str()); -} - -// Formats an int value as "%02d". -std::string String::FormatIntWidth2(int value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << value; - return ss.str(); -} - -// Formats an int value as "%X". -std::string String::FormatHexInt(int value) { - std::stringstream ss; - ss << std::hex << std::uppercase << value; - return ss.str(); -} - -// Formats a byte as "%02X". -std::string String::FormatByte(unsigned char value) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase - << static_cast(value); - return ss.str(); -} - -// Converts the buffer in a stringstream to an std::string, converting NUL -// bytes to "\\0" along the way. -std::string StringStreamToString(::std::stringstream* ss) { - const ::std::string& str = ss->str(); - const char* const start = str.c_str(); - const char* const end = start + str.length(); - - std::string result; - result.reserve(2 * (end - start)); - for (const char* ch = start; ch != end; ++ch) { - if (*ch == '\0') { - result += "\\0"; // Replaces NUL with "\\0"; - } else { - result += *ch; - } - } - - return result; -} - -// Appends the user-supplied message to the Google-Test-generated message. -std::string AppendUserMessage(const std::string& gtest_msg, - const Message& user_msg) { - // Appends the user message if it's non-empty. - const std::string user_msg_string = user_msg.GetString(); - if (user_msg_string.empty()) { - return gtest_msg; - } - - return gtest_msg + "\n" + user_msg_string; -} - -} // namespace internal - -// class TestResult - -// Creates an empty TestResult. -TestResult::TestResult() - : death_test_count_(0), - elapsed_time_(0) { -} - -// D'tor. -TestResult::~TestResult() { -} - -// Returns the i-th test part result among all the results. i can -// range from 0 to total_part_count() - 1. If i is not in that range, -// aborts the program. -const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); - return test_part_results_.at(i); -} - -// Returns the i-th test property. i can range from 0 to -// test_property_count() - 1. If i is not in that range, aborts the -// program. -const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); - return test_properties_.at(i); -} - -// Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} - -// Adds a test part result to the list. -void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { - test_part_results_.push_back(test_part_result); -} - -// Adds a test property to the list. If a property with the same key as the -// supplied property is already represented, the value of this test_property -// replaces the old value for that key. -void TestResult::RecordProperty(const std::string& xml_element, - const TestProperty& test_property) { - if (!ValidateTestProperty(xml_element, test_property)) { - return; - } - internal::MutexLock lock(&test_properites_mutex_); - const std::vector::iterator property_with_matching_key = - std::find_if(test_properties_.begin(), test_properties_.end(), - internal::TestPropertyKeyIs(test_property.key())); - if (property_with_matching_key == test_properties_.end()) { - test_properties_.push_back(test_property); - return; - } - property_with_matching_key->SetValue(test_property.value()); -} - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuitesAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "random_seed", - "tests", - "time", - "timestamp" -}; - -// The list of reserved attributes used in the element of XML -// output. -static const char* const kReservedTestSuiteAttributes[] = { - "disabled", - "errors", - "failures", - "name", - "tests", - "time" -}; - -// The list of reserved attributes used in the element of XML output. -static const char* const kReservedTestCaseAttributes[] = { - "classname", - "name", - "status", - "time", - "type_param", - "value_param" -}; - -template -std::vector ArrayAsVector(const char* const (&array)[kSize]) { - return std::vector(array, array + kSize); -} - -static std::vector GetReservedAttributesForElement( - const std::string& xml_element) { - if (xml_element == "testsuites") { - return ArrayAsVector(kReservedTestSuitesAttributes); - } else if (xml_element == "testsuite") { - return ArrayAsVector(kReservedTestSuiteAttributes); - } else if (xml_element == "testcase") { - return ArrayAsVector(kReservedTestCaseAttributes); - } else { - GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; - } - // This code is unreachable but some compilers may not realizes that. - return std::vector(); -} - -static std::string FormatWordList(const std::vector& words) { - Message word_list; - for (size_t i = 0; i < words.size(); ++i) { - if (i > 0 && words.size() > 2) { - word_list << ", "; - } - if (i == words.size() - 1) { - word_list << "and "; - } - word_list << "'" << words[i] << "'"; - } - return word_list.GetString(); -} - -bool ValidateTestPropertyName(const std::string& property_name, - const std::vector& reserved_names) { - if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != - reserved_names.end()) { - ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name - << " (" << FormatWordList(reserved_names) - << " are reserved by " << GTEST_NAME_ << ")"; - return false; - } - return true; -} - -// Adds a failure if the key is a reserved attribute of the element named -// xml_element. Returns true if the property is valid. -bool TestResult::ValidateTestProperty(const std::string& xml_element, - const TestProperty& test_property) { - return ValidateTestPropertyName(test_property.key(), - GetReservedAttributesForElement(xml_element)); -} - -// Clears the object. -void TestResult::Clear() { - test_part_results_.clear(); - test_properties_.clear(); - death_test_count_ = 0; - elapsed_time_ = 0; -} - -// Returns true iff the test failed. -bool TestResult::Failed() const { - for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; - } - return false; -} - -// Returns true iff the test part fatally failed. -static bool TestPartFatallyFailed(const TestPartResult& result) { - return result.fatally_failed(); -} - -// Returns true iff the test fatally failed. -bool TestResult::HasFatalFailure() const { - return CountIf(test_part_results_, TestPartFatallyFailed) > 0; -} - -// Returns true iff the test part non-fatally failed. -static bool TestPartNonfatallyFailed(const TestPartResult& result) { - return result.nonfatally_failed(); -} - -// Returns true iff the test has a non-fatal failure. -bool TestResult::HasNonfatalFailure() const { - return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; -} - -// Gets the number of all test parts. This is the sum of the number -// of successful test parts and the number of failed test parts. -int TestResult::total_part_count() const { - return static_cast(test_part_results_.size()); -} - -// Returns the number of the test properties. -int TestResult::test_property_count() const { - return static_cast(test_properties_.size()); -} - -// class Test - -// Creates a Test object. - -// The c'tor saves the values of all Google Test flags. -Test::Test() - : gtest_flag_saver_(new internal::GTestFlagSaver) { -} - -// The d'tor restores the values of all Google Test flags. -Test::~Test() { - delete gtest_flag_saver_; -} - -// Sets up the test fixture. -// -// A sub-class may override this. -void Test::SetUp() { -} - -// Tears down the test fixture. -// -// A sub-class may override this. -void Test::TearDown() { -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, const std::string& value) { - UnitTest::GetInstance()->RecordProperty(key, value); -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const std::string& key, int value) { - Message value_message; - value_message << value; - RecordProperty(key, value_message.GetString().c_str()); -} - -namespace internal { - -void ReportFailureInUnknownLocation(TestPartResult::Type result_type, - const std::string& message) { - // This function is a friend of UnitTest and as such has access to - // AddTestPartResult. - UnitTest::GetInstance()->AddTestPartResult( - result_type, - NULL, // No info about the source file where the exception occurred. - -1, // We have no info on which line caused the exception. - message, - ""); // No stack trace, either. -} - -} // namespace internal - -// Google Test requires all tests in the same test case to use the same test -// fixture class. This function checks if the current test has the -// same fixture class as the first test in the current test case. If -// yes, it returns true; otherwise it generates a Google Test failure and -// returns false. -bool Test::HasSameFixtureClass() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - const TestCase* const test_case = impl->current_test_case(); - - // Info about the first test in the current test case. - const TestInfo* const first_test_info = test_case->test_info_list()[0]; - const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; - const char* const first_test_name = first_test_info->name(); - - // Info about the current test. - const TestInfo* const this_test_info = impl->current_test_info(); - const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; - const char* const this_test_name = this_test_info->name(); - - if (this_fixture_id != first_fixture_id) { - // Is the first test defined using TEST? - const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); - // Is this test defined using TEST? - const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); - - if (first_is_TEST || this_is_TEST) { - // The user mixed TEST and TEST_F in this test case - we'll tell - // him/her how to fix it. - - // Gets the name of the TEST and the name of the TEST_F. Note - // that first_is_TEST and this_is_TEST cannot both be true, as - // the fixture IDs are different for the two tests. - const char* const TEST_name = - first_is_TEST ? first_test_name : this_test_name; - const char* const TEST_F_name = - first_is_TEST ? this_test_name : first_test_name; - - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class, so mixing TEST_F and TEST in the same test case is\n" - << "illegal. In test case " << this_test_info->test_case_name() - << ",\n" - << "test " << TEST_F_name << " is defined using TEST_F but\n" - << "test " << TEST_name << " is defined using TEST. You probably\n" - << "want to change the TEST to TEST_F or move it to another test\n" - << "case."; - } else { - // The user defined two fixture classes with the same name in - // two namespaces - we'll tell him/her how to fix it. - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " - << this_test_info->test_case_name() << ",\n" - << "you defined test " << first_test_name - << " and test " << this_test_name << "\n" - << "using two different test fixture classes. This can happen if\n" - << "the two classes are from different namespaces or translation\n" - << "units and have the same name. You should probably rename one\n" - << "of the classes to put the tests into different test cases."; - } - return false; - } - - return true; -} - -#if GTEST_HAS_SEH - -// Adds an "exception thrown" fatal failure to the current test. This -// function returns its result via an output parameter pointer because VC++ -// prohibits creation of objects with destructors on stack in functions -// using __try (see error C2712). -static std::string* FormatSehExceptionMessage(DWORD exception_code, - const char* location) { - Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; - - return new std::string(message.GetString()); -} - -#endif // GTEST_HAS_SEH - -namespace internal { - -#if GTEST_HAS_EXCEPTIONS - -// Adds an "exception thrown" fatal failure to the current test. -static std::string FormatCxxExceptionMessage(const char* description, - const char* location) { - Message message; - if (description != NULL) { - message << "C++ exception with description \"" << description << "\""; - } else { - message << "Unknown C++ exception"; - } - message << " thrown in " << location << "."; - - return message.GetString(); -} - -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result); - -GoogleTestFailureException::GoogleTestFailureException( - const TestPartResult& failure) - : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} - -#endif // GTEST_HAS_EXCEPTIONS - -// We put these helper functions in the internal namespace as IBM's xlC -// compiler rejects the code if they were declared static. - -// Runs the given method and handles SEH exceptions it throws, when -// SEH is supported; returns the 0-value for type Result in case of an -// SEH exception. (Microsoft compilers cannot handle SEH and C++ -// exceptions in the same function. Therefore, we provide a separate -// wrapper function for handling SEH exceptions.) -template -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { -#if GTEST_HAS_SEH - __try { - return (object->*method)(); - } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT - GetExceptionCode())) { - // We create the exception message on the heap because VC++ prohibits - // creation of objects with destructors on stack in functions using __try - // (see error C2712). - std::string* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); - internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, - *exception_message); - delete exception_message; - return static_cast(0); - } -#else - (void)location; - return (object->*method)(); -#endif // GTEST_HAS_SEH -} - -// Runs the given method and catches and reports C++ and/or SEH-style -// exceptions, if they are supported; returns the 0-value for type -// Result in case of an SEH exception. -template -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { - // NOTE: The user code can affect the way in which Google Test handles - // exceptions by setting GTEST_FLAG(catch_exceptions), but only before - // RUN_ALL_TESTS() starts. It is technically possible to check the flag - // after the exception is caught and either report or re-throw the - // exception based on the flag's value: - // - // try { - // // Perform the test method. - // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) - // // Report the exception as failure. - // else - // throw; // Re-throws the original exception. - // } - // - // However, the purpose of this flag is to allow the program to drop into - // the debugger when the exception is thrown. On most platforms, once the - // control enters the catch block, the exception origin information is - // lost and the debugger will stop the program at the point of the - // re-throw in this function -- instead of at the point of the original - // throw statement in the code under test. For this reason, we perform - // the check early, sacrificing the ability to affect Google Test's - // exception handling in the method where the exception is thrown. - if (internal::GetUnitTestImpl()->catch_exceptions()) { -#if GTEST_HAS_EXCEPTIONS - try { - return HandleSehExceptionsInMethodIfSupported(object, method, location); - } catch (const internal::GoogleTestFailureException&) { // NOLINT - // This exception type can only be thrown by a failed Google - // Test assertion with the intention of letting another testing - // framework catch it. Therefore we just re-throw it. - throw; - } catch (const std::exception& e) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(e.what(), location)); - } catch (...) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(NULL, location)); - } - return static_cast(0); -#else - return HandleSehExceptionsInMethodIfSupported(object, method, location); -#endif // GTEST_HAS_EXCEPTIONS - } else { - return (object->*method)(); - } -} - -} // namespace internal - -// Runs the test and updates the test result. -void Test::Run() { - if (!HasSameFixtureClass()) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); - // We will run the test only if SetUp() was successful. - if (!HasFatalFailure()) { - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); - } - - // However, we want to clean up as much as possible. Hence we will - // always call TearDown(), even if SetUp() or the test body has - // failed. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); -} - -// Returns true iff the current test has a fatal failure. -bool Test::HasFatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); -} - -// Returns true iff the current test has a non-fatal failure. -bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); -} - -// class TestInfo - -// Constructs a TestInfo object. It assumes ownership of the test factory -// object. -TestInfo::TestInfo(const std::string& a_test_case_name, - const std::string& a_name, - const char* a_type_param, - const char* a_value_param, - internal::TypeId fixture_class_id, - internal::TestFactoryBase* factory) - : test_case_name_(a_test_case_name), - name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - value_param_(a_value_param ? new std::string(a_value_param) : NULL), - fixture_class_id_(fixture_class_id), - should_run_(false), - is_disabled_(false), - matches_filter_(false), - factory_(factory), - result_() {} - -// Destructs a TestInfo object. -TestInfo::~TestInfo() { delete factory_; } - -namespace internal { - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param: the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param: text representation of the test's value parameter, -// or NULL if this is not a value-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, - const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory) { - TestInfo* const test_info = - new TestInfo(test_case_name, name, type_param, value_param, - fixture_class_id, factory); - GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); - return test_info; -} - -#if GTEST_HAS_PARAM_TEST -void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line) { - Message errors; - errors - << "Attempted redefinition of test case " << test_case_name << ".\n" - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " << test_case_name << ", you tried\n" - << "to define a test using a fixture class different from the one\n" - << "used earlier. This can happen if the two fixture classes are\n" - << "from different namespaces and have the same name. You should\n" - << "probably rename one of the classes to put the tests into different\n" - << "test cases."; - - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors.GetString().c_str()); -} -#endif // GTEST_HAS_PARAM_TEST - -} // namespace internal - -namespace { - -// A predicate that checks the test name of a TestInfo against a known -// value. -// -// This is used for implementation of the TestCase class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestNameIs is copyable. -class TestNameIs { - public: - // Constructor. - // - // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} - - // Returns true iff the test name of test_info matches name_. - bool operator()(const TestInfo * test_info) const { - return test_info && test_info->name() == name_; - } - - private: - std::string name_; -}; - -} // namespace - -namespace internal { - -// This method expands all parameterized tests registered with macros TEST_P -// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. -// This will be done just once during the program runtime. -void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST - if (!parameterized_tests_registered_) { - parameterized_test_registry_.RegisterTests(); - parameterized_tests_registered_ = true; - } -#endif -} - -} // namespace internal - -// Creates the test object, runs it, records its result, and then -// deletes it. -void TestInfo::Run() { - if (!should_run_) return; - - // Tells UnitTest where to store test result. - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_info(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - // Notifies the unit test event listeners that a test is about to start. - repeater->OnTestStart(*this); - - const TimeInMillis start = internal::GetTimeInMillis(); - - impl->os_stack_trace_getter()->UponLeavingGTest(); - - // Creates the test object. - Test* const test = internal::HandleExceptionsInMethodIfSupported( - factory_, &internal::TestFactoryBase::CreateTest, - "the test fixture's constructor"); - - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { - // This doesn't throw as all user code that can throw are wrapped into - // exception handling code. - test->Run(); - } - - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); - - result_.set_elapsed_time(internal::GetTimeInMillis() - start); - - // Notifies the unit test event listener that a test has just finished. - repeater->OnTestEnd(*this); - - // Tells UnitTest to stop associating assertion results to this - // test. - impl->set_current_test_info(NULL); -} - -// class TestCase - -// Gets the number of successful tests in this test case. -int TestCase::successful_test_count() const { - return CountIf(test_info_list_, TestPassed); -} - -// Gets the number of failed tests in this test case. -int TestCase::failed_test_count() const { - return CountIf(test_info_list_, TestFailed); -} - -// Gets the number of disabled tests that will be reported in the XML report. -int TestCase::reportable_disabled_test_count() const { - return CountIf(test_info_list_, TestReportableDisabled); -} - -// Gets the number of disabled tests in this test case. -int TestCase::disabled_test_count() const { - return CountIf(test_info_list_, TestDisabled); -} - -// Gets the number of tests to be printed in the XML report. -int TestCase::reportable_test_count() const { - return CountIf(test_info_list_, TestReportable); -} - -// Get the number of tests in this test case that should run. -int TestCase::test_to_run_count() const { - return CountIf(test_info_list_, ShouldRunTest); -} - -// Gets the number of all tests. -int TestCase::total_test_count() const { - return static_cast(test_info_list_.size()); -} - -// Creates a TestCase with the given name. -// -// Arguments: -// -// name: name of the test case -// a_type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase::TestCase(const char* a_name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) - : name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - set_up_tc_(set_up_tc), - tear_down_tc_(tear_down_tc), - should_run_(false), - elapsed_time_(0) { -} - -// Destructor of TestCase. -TestCase::~TestCase() { - // Deletes every Test in the collection. - ForEach(test_info_list_, internal::Delete); -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -const TestInfo* TestCase::GetTestInfo(int i) const { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -TestInfo* TestCase::GetMutableTestInfo(int i) { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Adds a test to this test case. Will delete the test upon -// destruction of the TestCase object. -void TestCase::AddTestInfo(TestInfo * test_info) { - test_info_list_.push_back(test_info); - test_indices_.push_back(static_cast(test_indices_.size())); -} - -// Runs every test in this TestCase. -void TestCase::Run() { - if (!should_run_) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_case(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - repeater->OnTestCaseStart(*this); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); - - const internal::TimeInMillis start = internal::GetTimeInMillis(); - for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - } - elapsed_time_ = internal::GetTimeInMillis() - start; - - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); - - repeater->OnTestCaseEnd(*this); - impl->set_current_test_case(NULL); -} - -// Clears the results of all tests in this test case. -void TestCase::ClearResult() { - ad_hoc_test_result_.Clear(); - ForEach(test_info_list_, TestInfo::ClearTestResult); -} - -// Shuffles the tests in this test case. -void TestCase::ShuffleTests(internal::Random* random) { - Shuffle(random, &test_indices_); -} - -// Restores the test order to before the first shuffle. -void TestCase::UnshuffleTests() { - for (size_t i = 0; i < test_indices_.size(); i++) { - test_indices_[i] = static_cast(i); - } -} - -// Formats a countable noun. Depending on its quantity, either the -// singular form or the plural form is used. e.g. -// -// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". -// FormatCountableNoun(5, "book", "books") returns "5 books". -static std::string FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { - return internal::StreamableToString(count) + " " + - (count == 1 ? singular_form : plural_form); -} - -// Formats the count of tests. -static std::string FormatTestCount(int test_count) { - return FormatCountableNoun(test_count, "test", "tests"); -} - -// Formats the count of test cases. -static std::string FormatTestCaseCount(int test_case_count) { - return FormatCountableNoun(test_case_count, "test case", "test cases"); -} - -// Converts a TestPartResult::Type enum to human-friendly string -// representation. Both kNonFatalFailure and kFatalFailure are translated -// to "Failure", as the user usually doesn't care about the difference -// between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { - switch (type) { - case TestPartResult::kSuccess: - return "Success"; - - case TestPartResult::kNonFatalFailure: - case TestPartResult::kFatalFailure: -#ifdef _MSC_VER - return "error: "; -#else - return "Failure\n"; -#endif - default: - return "Unknown result type"; - } -} - -namespace internal { - -// Prints a TestPartResult to an std::string. -static std::string PrintTestPartResultToString( - const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); -} - -// Prints a TestPartResult. -static void PrintTestPartResult(const TestPartResult& test_part_result) { - const std::string& result = - PrintTestPartResultToString(test_part_result); - printf("%s\n", result.c_str()); - fflush(stdout); - // If the test program runs in Visual Studio or a debugger, the - // following statements add the test part result message to the Output - // window such that the user can double-click on it to jump to the - // corresponding source code location; otherwise they do nothing. -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - // We don't call OutputDebugString*() on Windows Mobile, as printing - // to stdout is done by OutputDebugString() there already - we don't - // want the same message printed twice. - ::OutputDebugStringA(result.c_str()); - ::OutputDebugStringA("\n"); -#endif -} - -// class PrettyUnitTestResultPrinter - -enum GTestColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW -}; - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { - switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; - } -} - -#else - -// Returns the ANSI color code for the given color. COLOR_DEFAULT is -// an invalid input. -const char* GetAnsiColorCode(GTestColor color) { - switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; - default: return NULL; - }; -} - -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns true iff Google Test should use colors in the output. -bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); - - if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return stdout_is_tty; -#else - // On non-Windows platforms, we rely on the TERM variable. - const char* const term = posix::GetEnv("TERM"); - const bool term_supports_color = - String::CStringEquals(term, "xterm") || - String::CStringEquals(term, "xterm-color") || - String::CStringEquals(term, "xterm-256color") || - String::CStringEquals(term, "screen") || - String::CStringEquals(term, "screen-256color") || - String::CStringEquals(term, "linux") || - String::CStringEquals(term, "cygwin"); - return stdout_is_tty && term_supports_color; -#endif // GTEST_OS_WINDOWS - } - - return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); - // We take "yes", "true", "t", and "1" as meaning "yes". If the - // value is neither one of these nor "auto", we treat it as "no" to - // be conservative. -} - -// Helpers for printing colored strings to stdout. Note that on Windows, we -// cannot simply emit special characters and have the terminal change colors. -// This routine must actually emit the characters rather than return a string -// that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || GTEST_OS_IOS - const bool use_color = false; -#else - static const bool in_color_mode = - ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - // The '!= 0' comparison is necessary to satisfy MSVC 7.1. - - if (!use_color) { - vprintf(fmt, args); - va_end(args); - return; - } - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. - SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - printf("\033[0;3%sm", GetAnsiColorCode(color)); - vprintf(fmt, args); - printf("\033[m"); // Resets the terminal to default. -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - va_end(args); -} - -// Text printed in Google Test's text output and --gunit_list_tests -// output to label the type parameter and value parameter for a test. -static const char kTypeParamLabel[] = "TypeParam"; -static const char kValueParamLabel[] = "GetParam()"; - -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { - const char* const type_param = test_info.type_param(); - const char* const value_param = test_info.value_param(); - - if (type_param != NULL || value_param != NULL) { - printf(", where "); - if (type_param != NULL) { - printf("%s = %s", kTypeParamLabel, type_param); - if (value_param != NULL) - printf(" and "); - } - if (value_param != NULL) { - printf("%s = %s", kValueParamLabel, value_param); - } - } -} - -// This class implements the TestEventListener interface. -// -// Class PrettyUnitTestResultPrinter is copyable. -class PrettyUnitTestResultPrinter : public TestEventListener { - public: - PrettyUnitTestResultPrinter() {} - static void PrintTestName(const char * test_case, const char * test) { - printf("%s.%s", test_case, test); - } - - // The following methods override what's in the TestEventListener class. - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} - - private: - static void PrintFailedTests(const UnitTest& unit_test); -}; - - // Fired before each iteration of tests starts. -void PrettyUnitTestResultPrinter::OnTestIterationStart( - const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) - printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - - const char* const filter = GTEST_FLAG(filter).c_str(); - - // Prints the filter if it's not *. This reminds the user that some - // tests may be skipped. - if (!String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); - } - - if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { - const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", - static_cast(shard_index) + 1, - internal::posix::GetEnv(kTestTotalShards)); - } - - if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, - "Note: Randomizing tests' orders with a seed of %d .\n", - unit_test.random_seed()); - } - - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("Running %s from %s.\n", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment set-up.\n"); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s", counts.c_str(), test_case.name()); - if (test_case.type_param() == NULL) { - printf("\n"); - } else { - printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); - PrintTestName(test_info.test_case_name(), test_info.name()); - printf("\n"); - fflush(stdout); -} - -// Called after an assertion failure. -void PrettyUnitTestResultPrinter::OnTestPartResult( - const TestPartResult& result) { - // If the test part succeeded, we don't need to do anything. - if (result.type() == TestPartResult::kSuccess) - return; - - // Print failure message from the assertion (e.g. expected this and got that). - PrintTestPartResult(result); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { - if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); - } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - } - PrintTestName(test_info.test_case_name(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); - - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); - } else { - printf("\n"); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; - - const std::string counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s (%s ms total)\n\n", - counts.c_str(), test_case.name(), - internal::StreamableToString(test_case.elapsed_time()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment tear-down\n"); - fflush(stdout); -} - -// Internal helper for printing the list of failed tests. -void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { - const int failed_test_count = unit_test.failed_test_count(); - if (failed_test_count == 0) { - return; - } - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - const TestCase& test_case = *unit_test.GetTestCase(i); - if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { - continue; - } - for (int j = 0; j < test_case.total_test_count(); ++j) { - const TestInfo& test_info = *test_case.GetTestInfo(j); - if (!test_info.should_run() || test_info.result()->Passed()) { - continue; - } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s.%s", test_case.name(), test_info.name()); - PrintFullTestCommentIfPresent(test_info); - printf("\n"); - } - } -} - -void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("%s from %s ran.", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms total)", - internal::StreamableToString(unit_test.elapsed_time()).c_str()); - } - printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); - printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); - - int num_failures = unit_test.failed_test_count(); - if (!unit_test.Passed()) { - const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); - PrintFailedTests(unit_test); - printf("\n%2d FAILED %s\n", num_failures, - num_failures == 1 ? "TEST" : "TESTS"); - } - - int num_disabled = unit_test.reportable_disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { - if (!num_failures) { - printf("\n"); // Add a spacer if no FAILURE banner is displayed. - } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); - } - // Ensure that Google Test output is printed before, e.g., heapchecker output. - fflush(stdout); -} - -// End PrettyUnitTestResultPrinter - -// class TestEventRepeater -// -// This class forwards events to other event listeners. -class TestEventRepeater : public TestEventListener { - public: - TestEventRepeater() : forwarding_enabled_(true) {} - virtual ~TestEventRepeater(); - void Append(TestEventListener *listener); - TestEventListener* Release(TestEventListener* listener); - - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled() const { return forwarding_enabled_; } - void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } - - virtual void OnTestProgramStart(const UnitTest& unit_test); - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& unit_test); - - private: - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled_; - // The list of listeners that receive events. - std::vector listeners_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); -}; - -TestEventRepeater::~TestEventRepeater() { - ForEach(listeners_, Delete); -} - -void TestEventRepeater::Append(TestEventListener *listener) { - listeners_.push_back(listener); -} - -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { - for (size_t i = 0; i < listeners_.size(); ++i) { - if (listeners_[i] == listener) { - listeners_.erase(listeners_.begin() + i); - return listener; - } - } - - return NULL; -} - -// Since most methods are very similar, use macros to reduce boilerplate. -// This defines a member that forwards the call to all listeners. -#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} -// This defines a member that forwards the call to all listeners in reverse -// order. -#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} - -GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) -GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) -GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) -GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) -GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) -GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) -GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) -GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) - -#undef GTEST_REPEATER_METHOD_ -#undef GTEST_REVERSE_REPEATER_METHOD_ - -void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (size_t i = 0; i < listeners_.size(); i++) { - listeners_[i]->OnTestIterationStart(unit_test, iteration); - } - } -} - -void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { - listeners_[i]->OnTestIterationEnd(unit_test, iteration); - } - } -} - -// End TestEventRepeater - -// This class generates an XML output file. -class XmlUnitTestResultPrinter : public EmptyTestEventListener { - public: - explicit XmlUnitTestResultPrinter(const char* output_file); - - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - - private: - // Is c a whitespace character that is normalized to a space character - // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; - } - - // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { - return IsNormalizableWhitespace(c) || c >= 0x20; - } - - // Returns an XML-escaped copy of the input string str. If - // is_attribute is true, the text is meant to appear as an attribute - // value, and normalizable whitespace is preserved by replacing it - // with character references. - static std::string EscapeXml(const std::string& str, bool is_attribute); - - // Returns the given string with all characters invalid in XML removed. - static std::string RemoveInvalidXmlCharacters(const std::string& str); - - // Convenience wrapper around EscapeXml when str is an attribute value. - static std::string EscapeXmlAttribute(const std::string& str) { - return EscapeXml(str, true); - } - - // Convenience wrapper around EscapeXml when str is not an attribute value. - static std::string EscapeXmlText(const char* str) { - return EscapeXml(str, false); - } - - // Verifies that the given attribute belongs to the given element and - // streams the attribute as XML. - static void OutputXmlAttribute(std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value); - - // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. - static void OutputXmlCDataSection(::std::ostream* stream, const char* data); - - // Streams an XML representation of a TestInfo object. - static void OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info); - - // Prints an XML representation of a TestCase object - static void PrintXmlTestCase(::std::ostream* stream, - const TestCase& test_case); - - // Prints an XML summary of unit_test to output stream out. - static void PrintXmlUnitTest(::std::ostream* stream, - const UnitTest& unit_test); - - // Produces a string representing the test properties in a result as space - // delimited XML attributes based on the property key="value" pairs. - // When the std::string is not empty, it includes a space at the beginning, - // to delimit this attribute from prior attributes. - static std::string TestPropertiesAsXmlAttributes(const TestResult& result); - - // The output file. - const std::string output_file_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); -}; - -// Creates a new XmlUnitTestResultPrinter. -XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) - : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); - } -} - -// Called after the unit test ends. -void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure. - // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } - std::stringstream stream; - PrintXmlUnitTest(&stream, unit_test); - fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); - fclose(xmlout); -} - -// Returns an XML-escaped copy of the input string str. If is_attribute -// is true, the text is meant to appear as an attribute value, and -// normalizable whitespace is preserved by replacing it with character -// references. -// -// Invalid XML characters in str, if any, are stripped from the output. -// It is expected that most, if not all, of the text processed by this -// module will consist of ordinary English text. -// If this module is ever modified to produce version 1.1 XML output, -// most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable -// escaping scheme for invalid characters, rather than dropping them. -std::string XmlUnitTestResultPrinter::EscapeXml( - const std::string& str, bool is_attribute) { - Message m; - - for (size_t i = 0; i < str.size(); ++i) { - const char ch = str[i]; - switch (ch) { - case '<': - m << "<"; - break; - case '>': - m << ">"; - break; - case '&': - m << "&"; - break; - case '\'': - if (is_attribute) - m << "'"; - else - m << '\''; - break; - case '"': - if (is_attribute) - m << """; - else - m << '"'; - break; - default: - if (IsValidXmlCharacter(ch)) { - if (is_attribute && IsNormalizableWhitespace(ch)) - m << "&#x" << String::FormatByte(static_cast(ch)) - << ";"; - else - m << ch; - } - break; - } - } - - return m.GetString(); -} - -// Returns the given string with all characters invalid in XML removed. -// Currently invalid characters are dropped from the string. An -// alternative is to replace them with certain characters such as . or ?. -std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( - const std::string& str) { - std::string output; - output.reserve(str.size()); - for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) - if (IsValidXmlCharacter(*it)) - output.push_back(*it); - - return output; -} - -// The following routines generate an XML representation of a UnitTest -// object. -// -// This is how Google Test concepts map to the DTD: -// -// <-- corresponds to a UnitTest object -// <-- corresponds to a TestCase object -// <-- corresponds to a TestInfo object -// ... -// ... -// ... -// <-- individual assertion failures -// -// -// - -// Formats the given time in milliseconds as seconds. -std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) { - ::std::stringstream ss; - ss << ms/1000.0; - return ss.str(); -} - -// Converts the given epoch time in milliseconds to a date string in the ISO -// 8601 format, without the timezone information. -std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { - // Using non-reentrant version as localtime_r is not portable. - time_t seconds = static_cast(ms / 1000); -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996 - // (function or variable may be unsafe). - const struct tm* const time_struct = localtime(&seconds); // NOLINT -# pragma warning(pop) // Restores the warning state again. -#else - const struct tm* const time_struct = localtime(&seconds); // NOLINT -#endif - if (time_struct == NULL) - return ""; // Invalid ms value - - // YYYY-MM-DDThh:mm:ss - return StreamableToString(time_struct->tm_year + 1900) + "-" + - String::FormatIntWidth2(time_struct->tm_mon + 1) + "-" + - String::FormatIntWidth2(time_struct->tm_mday) + "T" + - String::FormatIntWidth2(time_struct->tm_hour) + ":" + - String::FormatIntWidth2(time_struct->tm_min) + ":" + - String::FormatIntWidth2(time_struct->tm_sec); -} - -// Streams an XML CDATA section, escaping invalid CDATA sequences as needed. -void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, - const char* data) { - const char* segment = data; - *stream << ""); - if (next_segment != NULL) { - stream->write( - segment, static_cast(next_segment - segment)); - *stream << "]]>]]>"); - } else { - *stream << segment; - break; - } - } - *stream << "]]>"; -} - -void XmlUnitTestResultPrinter::OutputXmlAttribute( - std::ostream* stream, - const std::string& element_name, - const std::string& name, - const std::string& value) { - const std::vector& allowed_names = - GetReservedAttributesForElement(element_name); - - GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != - allowed_names.end()) - << "Attribute " << name << " is not allowed for element <" << element_name - << ">."; - - *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; -} - -// Prints an XML representation of a TestInfo object. -// TODO(wan): There is also value in printing properties with the plain printer. -void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info) { - const TestResult& result = *test_info.result(); - const std::string kTestcase = "testcase"; - - *stream << " \n"; - } - const string location = internal::FormatCompilerIndependentFileLocation( - part.file_name(), part.line_number()); - const string summary = location + "\n" + part.summary(); - *stream << " "; - const string detail = location + "\n" + part.message(); - OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); - *stream << "\n"; - } - } - - if (failures == 0) - *stream << " />\n"; - else - *stream << " \n"; -} - -// Prints an XML representation of a TestCase object -void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream, - const TestCase& test_case) { - const std::string kTestsuite = "testsuite"; - *stream << " <" << kTestsuite; - OutputXmlAttribute(stream, kTestsuite, "name", test_case.name()); - OutputXmlAttribute(stream, kTestsuite, "tests", - StreamableToString(test_case.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuite, "failures", - StreamableToString(test_case.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuite, "disabled", - StreamableToString(test_case.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuite, "errors", "0"); - OutputXmlAttribute(stream, kTestsuite, "time", - FormatTimeInMillisAsSeconds(test_case.elapsed_time())); - *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()) - << ">\n"; - - for (int i = 0; i < test_case.total_test_count(); ++i) { - if (test_case.GetTestInfo(i)->is_reportable()) - OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); - } - *stream << " \n"; -} - -// Prints an XML summary of unit_test to output stream out. -void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, - const UnitTest& unit_test) { - const std::string kTestsuites = "testsuites"; - - *stream << "\n"; - *stream << "<" << kTestsuites; - - OutputXmlAttribute(stream, kTestsuites, "tests", - StreamableToString(unit_test.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuites, "failures", - StreamableToString(unit_test.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuites, "disabled", - StreamableToString(unit_test.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuites, "errors", "0"); - OutputXmlAttribute( - stream, kTestsuites, "timestamp", - FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); - OutputXmlAttribute(stream, kTestsuites, "time", - FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); - - if (GTEST_FLAG(shuffle)) { - OutputXmlAttribute(stream, kTestsuites, "random_seed", - StreamableToString(unit_test.random_seed())); - } - - *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); - - OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); - *stream << ">\n"; - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - if (unit_test.GetTestCase(i)->reportable_test_count() > 0) - PrintXmlTestCase(stream, *unit_test.GetTestCase(i)); - } - *stream << "\n"; -} - -// Produces a string representing the test properties in a result as space -// delimited XML attributes based on the property key="value" pairs. -std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( - const TestResult& result) { - Message attributes; - for (int i = 0; i < result.test_property_count(); ++i) { - const TestProperty& property = result.GetTestProperty(i); - attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; - } - return attributes.GetString(); -} - -// End XmlUnitTestResultPrinter - -#if GTEST_CAN_STREAM_RESULTS_ - -// Checks if str contains '=', '&', '%' or '\n' characters. If yes, -// replaces them by "%xx" where xx is their hexadecimal value. For -// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) -// in both time and space -- important as the input str may contain an -// arbitrarily long test failure message and stack trace. -string StreamingListener::UrlEncode(const char* str) { - string result; - result.reserve(strlen(str) + 1); - for (char ch = *str; ch != '\0'; ch = *++str) { - switch (ch) { - case '%': - case '=': - case '&': - case '\n': - result.append("%" + String::FormatByte(static_cast(ch))); - break; - default: - result.push_back(ch); - break; - } - } - return result; -} - -void StreamingListener::SocketWriter::MakeConnection() { - GTEST_CHECK_(sockfd_ == -1) - << "MakeConnection() can't be called when there is already a connection."; - - addrinfo hints; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. - hints.ai_socktype = SOCK_STREAM; - addrinfo* servinfo = NULL; - - // Use the getaddrinfo() to get a linked list of IP addresses for - // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); - if (error_num != 0) { - GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " - << gai_strerror(error_num); - } - - // Loop through all the results and connect to the first we can. - for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; - cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); - if (sockfd_ != -1) { - // Connect the client socket to the server socket. - if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { - close(sockfd_); - sockfd_ = -1; - } - } - } - - freeaddrinfo(servinfo); // all done with this structure - - if (sockfd_ == -1) { - GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " - << host_name_ << ":" << port_num_; - } -} - -// End of class Streaming Listener -#endif // GTEST_CAN_STREAM_RESULTS__ - -// Class ScopedTrace - -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); - - UnitTest::GetInstance()->PushGTestTrace(trace); -} - -// Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - UnitTest::GetInstance()->PopGTestTrace(); -} - - -// class OsStackTraceGetter - -// Returns the current OS stack trace as an std::string. Parameters: -// -// max_depth - the maximum number of stack frames to be included -// in the trace. -// skip_count - the number of top frames to be skipped; doesn't count -// against max_depth. -// -string OsStackTraceGetter::CurrentStackTrace(int /* max_depth */, - int /* skip_count */) - GTEST_LOCK_EXCLUDED_(mutex_) { - return ""; -} - -void OsStackTraceGetter::UponLeavingGTest() - GTEST_LOCK_EXCLUDED_(mutex_) { -} - -const char* const -OsStackTraceGetter::kElidedFramesMarker = - "... " GTEST_NAME_ " internal frames ..."; - -// A helper class that creates the premature-exit file in its -// constructor and deletes the file in its destructor. -class ScopedPrematureExitFile { - public: - explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath) { - // If a path to the premature-exit file is specified... - if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { - // create the file with a single "0" character in it. I/O - // errors are ignored as there's nothing better we can do and we - // don't want to fail the test because of this. - FILE* pfile = posix::FOpen(premature_exit_filepath, "w"); - fwrite("0", 1, 1, pfile); - fclose(pfile); - } - } - - ~ScopedPrematureExitFile() { - if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { - remove(premature_exit_filepath_); - } - } - - private: - const char* const premature_exit_filepath_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); -}; - -} // namespace internal - -// class TestEventListeners - -TestEventListeners::TestEventListeners() - : repeater_(new internal::TestEventRepeater()), - default_result_printer_(NULL), - default_xml_generator_(NULL) { -} - -TestEventListeners::~TestEventListeners() { delete repeater_; } - -// Returns the standard listener responsible for the default console -// output. Can be removed from the listeners list to shut down default -// console output. Note that removing this object from the listener list -// with Release transfers its ownership to the user. -void TestEventListeners::Append(TestEventListener* listener) { - repeater_->Append(listener); -} - -// Removes the given event listener from the list and returns it. It then -// becomes the caller's responsibility to delete the listener. Returns -// NULL if the listener is not found in the list. -TestEventListener* TestEventListeners::Release(TestEventListener* listener) { - if (listener == default_result_printer_) - default_result_printer_ = NULL; - else if (listener == default_xml_generator_) - default_xml_generator_ = NULL; - return repeater_->Release(listener); -} - -// Returns repeater that broadcasts the TestEventListener events to all -// subscribers. -TestEventListener* TestEventListeners::repeater() { return repeater_; } - -// Sets the default_result_printer attribute to the provided listener. -// The listener is also added to the listener list and previous -// default_result_printer is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { - if (default_result_printer_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_result_printer_); - default_result_printer_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Sets the default_xml_generator attribute to the provided listener. The -// listener is also added to the listener list and previous -// default_xml_generator is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { - if (default_xml_generator_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_xml_generator_); - default_xml_generator_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Controls whether events will be forwarded by the repeater to the -// listeners in the list. -bool TestEventListeners::EventForwardingEnabled() const { - return repeater_->forwarding_enabled(); -} - -void TestEventListeners::SuppressEventForwarding() { - repeater_->set_forwarding_enabled(false); -} - -// class UnitTest - -// Gets the singleton UnitTest object. The first time this method is -// called, a UnitTest object is constructed and returned. Consecutive -// calls will return the same object. -// -// We don't protect this under mutex_ as a user is not supposed to -// call this before main() starts, from which point on the return -// value will never change. -UnitTest* UnitTest::GetInstance() { - // When compiled with MSVC 7.1 in optimized mode, destroying the - // UnitTest object upon exiting the program messes up the exit code, - // causing successful tests to appear failed. We have to use a - // different implementation in this case to bypass the compiler bug. - // This implementation makes the compiler happy, at the cost of - // leaking the UnitTest object. - - // CodeGear C++Builder insists on a public destructor for the - // default implementation. Use this implementation to keep good OO - // design with private destructor. - -#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) - static UnitTest* const instance = new UnitTest; - return instance; -#else - static UnitTest instance; - return &instance; -#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) -} - -// Gets the number of successful test cases. -int UnitTest::successful_test_case_count() const { - return impl()->successful_test_case_count(); -} - -// Gets the number of failed test cases. -int UnitTest::failed_test_case_count() const { - return impl()->failed_test_case_count(); -} - -// Gets the number of all test cases. -int UnitTest::total_test_case_count() const { - return impl()->total_test_case_count(); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTest::test_case_to_run_count() const { - return impl()->test_case_to_run_count(); -} - -// Gets the number of successful tests. -int UnitTest::successful_test_count() const { - return impl()->successful_test_count(); -} - -// Gets the number of failed tests. -int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } - -// Gets the number of disabled tests that will be reported in the XML report. -int UnitTest::reportable_disabled_test_count() const { - return impl()->reportable_disabled_test_count(); -} - -// Gets the number of disabled tests. -int UnitTest::disabled_test_count() const { - return impl()->disabled_test_count(); -} - -// Gets the number of tests to be printed in the XML report. -int UnitTest::reportable_test_count() const { - return impl()->reportable_test_count(); -} - -// Gets the number of all tests. -int UnitTest::total_test_count() const { return impl()->total_test_count(); } - -// Gets the number of tests that should run. -int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } - -// Gets the time of the test program start, in ms from the start of the -// UNIX epoch. -internal::TimeInMillis UnitTest::start_timestamp() const { - return impl()->start_timestamp(); -} - -// Gets the elapsed time, in milliseconds. -internal::TimeInMillis UnitTest::elapsed_time() const { - return impl()->elapsed_time(); -} - -// Returns true iff the unit test passed (i.e. all test cases passed). -bool UnitTest::Passed() const { return impl()->Passed(); } - -// Returns true iff the unit test failed (i.e. some test case failed -// or something outside of all tests failed). -bool UnitTest::Failed() const { return impl()->Failed(); } - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -const TestCase* UnitTest::GetTestCase(int i) const { - return impl()->GetTestCase(i); -} - -// Returns the TestResult containing information on test failures and -// properties logged outside of individual test cases. -const TestResult& UnitTest::ad_hoc_test_result() const { - return *impl()->ad_hoc_test_result(); -} - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -TestCase* UnitTest::GetMutableTestCase(int i) { - return impl()->GetMutableTestCase(i); -} - -// Returns the list of event listeners that can be used to track events -// inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} - -// Registers and returns a global test environment. When a test -// program is run, all global test environments will be set-up in the -// order they were registered. After all tests in the program have -// finished, all global test environments will be torn-down in the -// *reverse* order they were registered. -// -// The UnitTest object takes ownership of the given environment. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -Environment* UnitTest::AddEnvironment(Environment* env) { - if (env == NULL) { - return NULL; - } - - impl_->environments().push_back(env); - return env; -} - -// Adds a TestPartResult to the current TestResult object. All Google Test -// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call -// this to report their results. The user code should use the -// assertion macros instead of calling this directly. -void UnitTest::AddTestPartResult( - TestPartResult::Type result_type, - const char* file_name, - int line_number, - const std::string& message, - const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) { - Message msg; - msg << message; - - internal::MutexLock lock(&mutex_); - if (impl_->gtest_trace_stack().size() > 0) { - msg << "\n" << GTEST_NAME_ << " trace:"; - - for (int i = static_cast(impl_->gtest_trace_stack().size()); - i > 0; --i) { - const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; - } - } - - if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { - msg << internal::kStackTraceMarker << os_stack_trace; - } - - const TestPartResult result = - TestPartResult(result_type, file_name, line_number, - msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); - - if (result_type != TestPartResult::kSuccess) { - // gtest_break_on_failure takes precedence over - // gtest_throw_on_failure. This allows a user to set the latter - // in the code (perhaps in order to use Google Test assertions - // with another testing framework) and specify the former on the - // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { -#if GTEST_OS_WINDOWS - // Using DebugBreak on Windows allows gtest to still break into a debugger - // when a failure happens and both the --gtest_break_on_failure and - // the --gtest_catch_exceptions flags are specified. - DebugBreak(); -#else - // Dereference NULL through a volatile pointer to prevent the compiler - // from removing. We use this rather than abort() or __builtin_trap() for - // portability: Symbian doesn't implement abort() well, and some debuggers - // don't correctly trap abort(). - *static_cast(NULL) = 1; -#endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { -#if GTEST_HAS_EXCEPTIONS - throw internal::GoogleTestFailureException(result); -#else - // We cannot call abort() as it generates a pop-up in debug mode - // that cannot be suppressed in VC 7.1 or below. - exit(1); -#endif - } - } -} - -// Adds a TestProperty to the current TestResult object when invoked from -// inside a test, to current TestCase's ad_hoc_test_result_ when invoked -// from SetUpTestCase or TearDownTestCase, or to the global property set -// when invoked elsewhere. If the result already contains a property with -// the same key, the value will be updated. -void UnitTest::RecordProperty(const std::string& key, - const std::string& value) { - impl_->RecordProperty(TestProperty(key, value)); -} - -// Runs all tests in this UnitTest object and prints the result. -// Returns 0 if successful, or 1 otherwise. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -int UnitTest::Run() { - const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; - - // Google Test implements this protocol for catching that a test - // program exits before returning control to Google Test: - // - // 1. Upon start, Google Test creates a file whose absolute path - // is specified by the environment variable - // TEST_PREMATURE_EXIT_FILE. - // 2. When Google Test has finished its work, it deletes the file. - // - // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before - // running a Google-Test-based test program and check the existence - // of the file at the end of the test execution to see if it has - // exited prematurely. - - // If we are in the child process of a death test, don't - // create/delete the premature exit file, as doing so is unnecessary - // and will confuse the parent process. Otherwise, create/delete - // the file upon entering/leaving this function. If the program - // somehow exits before this function has a chance to return, the - // premature-exit file will be left undeleted, causing a test runner - // that understands the premature-exit-file protocol to report the - // test as having failed. - const internal::ScopedPrematureExitFile premature_exit_file( - in_death_test_child_process ? - NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); - - // Captures the value of GTEST_FLAG(catch_exceptions). This value will be - // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); - -#if GTEST_HAS_SEH - // Either the user wants Google Test to catch exceptions thrown by the - // tests or this is executing in the context of death test child - // process. In either case the user does not want to see pop-up dialogs - // about crashes - they are expected. - if (impl()->catch_exceptions() || in_death_test_child_process) { -# if !GTEST_OS_WINDOWS_MOBILE - // SetErrorMode doesn't exist on CE. - SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | - SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE - -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE - // Death test children can be terminated with _abort(). On Windows, - // _abort() can show a dialog with a warning message. This forces the - // abort message to go to stderr instead. - _set_error_mode(_OUT_TO_STDERR); -# endif - -# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE - // In the debug version, Visual Studio pops up a separate dialog - // offering a choice to debug the aborted program. We need to suppress - // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement - // executed. Google Test will notify the user of any unexpected - // failure via stderr. - // - // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. - // Users of prior VC versions shall suffer the agony and pain of - // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the - // debug mode when compiled with VC 7.1 or lower. - if (!GTEST_FLAG(break_on_failure)) - _set_abort_behavior( - 0x0, // Clear the following flags: - _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. -# endif - } -#endif // GTEST_HAS_SEH - - return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; -} - -// Returns the working directory when the first TEST() or TEST_F() was -// executed. -const char* UnitTest::original_working_dir() const { - return impl_->original_working_dir_.c_str(); -} - -// Returns the TestCase object for the test that's currently running, -// or NULL if no test is running. -const TestCase* UnitTest::current_test_case() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_case(); -} - -// Returns the TestInfo object for the test that's currently running, -// or NULL if no test is running. -const TestInfo* UnitTest::current_test_info() const - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - return impl_->current_test_info(); -} - -// Returns the random seed used at the start of the current test run. -int UnitTest::random_seed() const { return impl_->random_seed(); } - -#if GTEST_HAS_PARAM_TEST -// Returns ParameterizedTestCaseRegistry object used to keep track of -// value-parameterized tests and instantiate and register them. -internal::ParameterizedTestCaseRegistry& - UnitTest::parameterized_test_registry() - GTEST_LOCK_EXCLUDED_(mutex_) { - return impl_->parameterized_test_registry(); -} -#endif // GTEST_HAS_PARAM_TEST - -// Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} - -// Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} - -// Pushes a trace defined by SCOPED_TRACE() on to the per-thread -// Google Test trace stack. -void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().push_back(trace); -} - -// Pops a trace from the per-thread Google Test trace stack. -void UnitTest::PopGTestTrace() - GTEST_LOCK_EXCLUDED_(mutex_) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().pop_back(); -} - -namespace internal { - -UnitTestImpl::UnitTestImpl(UnitTest* parent) - : parent_(parent), -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4355) // Temporarily disables warning 4355 - // (using this in initializer). - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -# pragma warning(pop) // Restores the warning state again. -#else - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -#endif // _MSC_VER - global_test_part_result_repoter_( - &default_global_test_part_result_reporter_), - per_thread_test_part_result_reporter_( - &default_per_thread_test_part_result_reporter_), -#if GTEST_HAS_PARAM_TEST - parameterized_test_registry_(), - parameterized_tests_registered_(false), -#endif // GTEST_HAS_PARAM_TEST - last_death_test_case_(-1), - current_test_case_(NULL), - current_test_info_(NULL), - ad_hoc_test_result_(), - os_stack_trace_getter_(NULL), - post_flag_parse_init_performed_(false), - random_seed_(0), // Will be overridden by the flag before first use. - random_(0), // Will be reseeded before first use. - start_timestamp_(0), - elapsed_time_(0), -#if GTEST_HAS_DEATH_TEST - death_test_factory_(new DefaultDeathTestFactory), -#endif - // Will be overridden by the flag before first use. - catch_exceptions_(false) { - listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); -} - -UnitTestImpl::~UnitTestImpl() { - // Deletes every TestCase. - ForEach(test_cases_, internal::Delete); - - // Deletes every Environment. - ForEach(environments_, internal::Delete); - - delete os_stack_trace_getter_; -} - -// Adds a TestProperty to the current TestResult object when invoked in a -// context of a test, to current test case's ad_hoc_test_result when invoke -// from SetUpTestCase/TearDownTestCase, or to the global property set -// otherwise. If the result already contains a property with the same key, -// the value will be updated. -void UnitTestImpl::RecordProperty(const TestProperty& test_property) { - std::string xml_element; - TestResult* test_result; // TestResult appropriate for property recording. - - if (current_test_info_ != NULL) { - xml_element = "testcase"; - test_result = &(current_test_info_->result_); - } else if (current_test_case_ != NULL) { - xml_element = "testsuite"; - test_result = &(current_test_case_->ad_hoc_test_result_); - } else { - xml_element = "testsuites"; - test_result = &ad_hoc_test_result_; - } - test_result->RecordProperty(xml_element, test_property); -} - -#if GTEST_HAS_DEATH_TEST -// Disables event forwarding if the control is currently in a death test -// subprocess. Must not be called before InitGoogleTest. -void UnitTestImpl::SuppressTestEventsIfInSubprocess() { - if (internal_run_death_test_flag_.get() != NULL) - listeners()->SuppressEventForwarding(); -} -#endif // GTEST_HAS_DEATH_TEST - -// Initializes event listeners performing XML output as specified by -// UnitTestOptions. Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureXmlOutput() { - const std::string& output_format = UnitTestOptions::GetOutputFormat(); - if (output_format == "xml") { - listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( - UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); - } else if (output_format != "") { - printf("WARNING: unrecognized output format \"%s\" ignored.\n", - output_format.c_str()); - fflush(stdout); - } -} - -#if GTEST_CAN_STREAM_RESULTS_ -// Initializes event listeners for streaming test results in string form. -// Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureStreamingOutput() { - const std::string& target = GTEST_FLAG(stream_result_to); - if (!target.empty()) { - const size_t pos = target.find(':'); - if (pos != std::string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); - } else { - printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", - target.c_str()); - fflush(stdout); - } - } -} -#endif // GTEST_CAN_STREAM_RESULTS_ - -// Performs initialization dependent upon flag values obtained in -// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to -// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest -// this function is also called from RunAllTests. Since this function can be -// called more than once, it has to be idempotent. -void UnitTestImpl::PostFlagParsingInit() { - // Ensures that this function does not execute more than once. - if (!post_flag_parse_init_performed_) { - post_flag_parse_init_performed_ = true; - -#if GTEST_HAS_DEATH_TEST - InitDeathTestSubprocessControlInfo(); - SuppressTestEventsIfInSubprocess(); -#endif // GTEST_HAS_DEATH_TEST - - // Registers parameterized tests. This makes parameterized tests - // available to the UnitTest reflection API without running - // RUN_ALL_TESTS. - RegisterParameterizedTests(); - - // Configures listeners for XML output. This makes it possible for users - // to shut down the default XML output before invoking RUN_ALL_TESTS. - ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Configures listeners for streaming test results to the specified server. - ConfigureStreamingOutput(); -#endif // GTEST_CAN_STREAM_RESULTS_ - } -} - -// A predicate that checks the name of a TestCase against a known -// value. -// -// This is used for implementation of the UnitTest class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestCaseNameIs is copyable. -class TestCaseNameIs { - public: - // Constructor. - explicit TestCaseNameIs(const std::string& name) - : name_(name) {} - - // Returns true iff the name of test_case matches name_. - bool operator()(const TestCase* test_case) const { - return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; - } - - private: - std::string name_; -}; - -// Finds and returns a TestCase with the given name. If one doesn't -// exist, creates one and returns it. It's the CALLER'S -// RESPONSIBILITY to ensure that this function is only called WHEN THE -// TESTS ARE NOT SHUFFLED. -// -// Arguments: -// -// test_case_name: name of the test case -// type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) { - // Can we find a TestCase with the given name? - const std::vector::const_iterator test_case = - std::find_if(test_cases_.begin(), test_cases_.end(), - TestCaseNameIs(test_case_name)); - - if (test_case != test_cases_.end()) - return *test_case; - - // No. Let's create one. - TestCase* const new_test_case = - new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); - - // Is this a death test case? - if (internal::UnitTestOptions::MatchesFilter(test_case_name, - kDeathTestCaseFilter)) { - // Yes. Inserts the test case after the last death test case - // defined so far. This only works when the test cases haven't - // been shuffled. Otherwise we may end up running a death test - // after a non-death test. - ++last_death_test_case_; - test_cases_.insert(test_cases_.begin() + last_death_test_case_, - new_test_case); - } else { - // No. Appends to the end of the list. - test_cases_.push_back(new_test_case); - } - - test_case_indices_.push_back(static_cast(test_case_indices_.size())); - return new_test_case; -} - -// Helpers for setting up / tearing down the given environment. They -// are for use in the ForEach() function. -static void SetUpEnvironment(Environment* env) { env->SetUp(); } -static void TearDownEnvironment(Environment* env) { env->TearDown(); } - -// Runs all tests in this UnitTest object, prints the result, and -// returns true if all tests are successful. If any exception is -// thrown during a test, the test is considered to be failed, but the -// rest of the tests will still be run. -// -// When parameterized tests are enabled, it expands and registers -// parameterized tests first in RegisterParameterizedTests(). -// All other functions called from RunAllTests() may safely assume that -// parameterized tests are ready to be counted and run. -bool UnitTestImpl::RunAllTests() { - // Makes sure InitGoogleTest() was called. - if (!GTestIsInitialized()) { - printf("%s", - "\nThis test program did NOT call ::testing::InitGoogleTest " - "before calling RUN_ALL_TESTS(). Please fix it.\n"); - return false; - } - - // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; - - // Repeats the call to the post-flag parsing initialization in case the - // user didn't call InitGoogleTest. - PostFlagParsingInit(); - - // Even if sharding is not on, test runners may want to use the - // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding - // protocol. - internal::WriteToShardStatusFileIfNeeded(); - - // True iff we are in a subprocess for running a thread-safe-style - // death test. - bool in_subprocess_for_death_test = false; - -#if GTEST_HAS_DEATH_TEST - in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); -#endif // GTEST_HAS_DEATH_TEST - - const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, - in_subprocess_for_death_test); - - // Compares the full test names with the filter to decide which - // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; - - // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { - // This must be called *after* FilterTests() has been called. - ListTestsMatchingFilter(); - return true; - } - - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; - - // True iff at least one test has failed. - bool failed = false; - - TestEventListener* repeater = listeners()->repeater(); - - start_timestamp_ = GetTimeInMillis(); - repeater->OnTestProgramStart(*parent_); - - // How many times to repeat the tests? We don't want to repeat them - // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); - // Repeats forever if the repeat count is negative. - const bool forever = repeat < 0; - for (int i = 0; forever || i != repeat; i++) { - // We want to preserve failures generated by ad-hoc test - // assertions executed before RUN_ALL_TESTS(). - ClearNonAdHocTestResult(); - - const TimeInMillis start = GetTimeInMillis(); - - // Shuffles test cases and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(random_seed_); - // This should be done before calling OnTestIterationStart(), - // such that a test event listener can see the actual test order - // in the event. - ShuffleTests(); - } - - // Tells the unit test event listeners that the tests are about to start. - repeater->OnTestIterationStart(*parent_, i); - - // Runs each test case if there is at least one test to run. - if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); - - // Runs the tests only if there was no fatal failure during global - // set-up. - if (!Test::HasFatalFailure()) { - for (int test_index = 0; test_index < total_test_case_count(); - test_index++) { - GetMutableTestCase(test_index)->Run(); - } - } - - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); - } - - elapsed_time_ = GetTimeInMillis() - start; - - // Tells the unit test event listener that the tests have just finished. - repeater->OnTestIterationEnd(*parent_, i); - - // Gets the result and clears it. - if (!Passed()) { - failed = true; - } - - // Restores the original test order after the iteration. This - // allows the user to quickly repro a failure that happens in the - // N-th iteration without repeating the first (N - 1) iterations. - // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in - // case the user somehow changes the value of the flag somewhere - // (it's always safe to unshuffle the tests). - UnshuffleTests(); - - if (GTEST_FLAG(shuffle)) { - // Picks a new random seed for each iteration. - random_seed_ = GetNextRandomSeed(random_seed_); - } - } - - repeater->OnTestProgramEnd(*parent_); - - return !failed; -} - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded() { - const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); - if (test_shard_file != NULL) { - FILE* const file = posix::FOpen(test_shard_file, "w"); - if (file == NULL) { - ColoredPrintf(COLOR_RED, - "Could not write to the test shard status file \"%s\" " - "specified by the %s environment variable.\n", - test_shard_file, kTestShardStatusFile); - fflush(stdout); - exit(EXIT_FAILURE); - } - fclose(file); - } -} - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (i.e., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. -bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, - bool in_subprocess_for_death_test) { - if (in_subprocess_for_death_test) { - return false; - } - - const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); - const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); - - if (total_shards == -1 && shard_index == -1) { - return false; - } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (total_shards != -1 && shard_index == -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } - - return total_shards > 1; -} - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error -// and aborts. -Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { - const char* str_val = posix::GetEnv(var); - if (str_val == NULL) { - return default_val; - } - - Int32 result; - if (!ParseInt32(Message() << "The value of environment variable " << var, - str_val, &result)) { - exit(EXIT_FAILURE); - } - return result; -} - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { - return (test_id % total_shards) == shard_index; -} - -// Compares the name of each test with the user-specified filter to -// decide whether the test should be run, then records the result in -// each TestCase and TestInfo object. -// If shard_tests == true, further filters tests based on sharding -// variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. -int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - - // num_runnable_tests are the number of tests that will - // run across all shards (i.e., match filter and are not disabled). - // num_selected_tests are the number of tests to be run on - // this shard. - int num_runnable_tests = 0; - int num_selected_tests = 0; - for (size_t i = 0; i < test_cases_.size(); i++) { - TestCase* const test_case = test_cases_[i]; - const std::string &test_case_name = test_case->name(); - test_case->set_should_run(false); - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - TestInfo* const test_info = test_case->test_info_list()[j]; - const std::string test_name(test_info->name()); - // A test is disabled if test case name or test name matches - // kDisableTestFilter. - const bool is_disabled = - internal::UnitTestOptions::MatchesFilter(test_case_name, - kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter(test_name, - kDisableTestFilter); - test_info->is_disabled_ = is_disabled; - - const bool matches_filter = - internal::UnitTestOptions::FilterMatchesTest(test_case_name, - test_name); - test_info->matches_filter_ = matches_filter; - - const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && - matches_filter; - - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); - - num_runnable_tests += is_runnable; - num_selected_tests += is_selected; - - test_info->should_run_ = is_selected; - test_case->set_should_run(test_case->should_run() || is_selected); - } - } - return num_selected_tests; -} - -// Prints the given C-string on a single line by replacing all '\n' -// characters with string "\\n". If the output takes more than -// max_length characters, only prints the first max_length characters -// and "...". -static void PrintOnOneLine(const char* str, int max_length) { - if (str != NULL) { - for (int i = 0; *str != '\0'; ++str) { - if (i >= max_length) { - printf("..."); - break; - } - if (*str == '\n') { - printf("\\n"); - i += 2; - } else { - printf("%c", *str); - ++i; - } - } - } -} - -// Prints the names of the tests matching the user-specified filter flag. -void UnitTestImpl::ListTestsMatchingFilter() { - // Print at most this many characters for each type/value parameter. - const int kMaxParamLength = 250; - - for (size_t i = 0; i < test_cases_.size(); i++) { - const TestCase* const test_case = test_cases_[i]; - bool printed_test_case_name = false; - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - const TestInfo* const test_info = - test_case->test_info_list()[j]; - if (test_info->matches_filter_) { - if (!printed_test_case_name) { - printed_test_case_name = true; - printf("%s.", test_case->name()); - if (test_case->type_param() != NULL) { - printf(" # %s = ", kTypeParamLabel); - // We print the type parameter on a single line to make - // the output easy to parse by a program. - PrintOnOneLine(test_case->type_param(), kMaxParamLength); - } - printf("\n"); - } - printf(" %s", test_info->name()); - if (test_info->value_param() != NULL) { - printf(" # %s = ", kValueParamLabel); - // We print the value parameter on a single line to make the - // output easy to parse by a program. - PrintOnOneLine(test_info->value_param(), kMaxParamLength); - } - printf("\n"); - } - } - } - fflush(stdout); -} - -// Sets the OS stack trace getter. -// -// Does nothing if the input and the current OS stack trace getter are -// the same; otherwise, deletes the old getter and makes the input the -// current getter. -void UnitTestImpl::set_os_stack_trace_getter( - OsStackTraceGetterInterface* getter) { - if (os_stack_trace_getter_ != getter) { - delete os_stack_trace_getter_; - os_stack_trace_getter_ = getter; - } -} - -// Returns the current OS stack trace getter if it is not NULL; -// otherwise, creates an OsStackTraceGetter, makes it the current -// getter, and returns it. -OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { - if (os_stack_trace_getter_ == NULL) { - os_stack_trace_getter_ = new OsStackTraceGetter; - } - - return os_stack_trace_getter_; -} - -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. -TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; -} - -// Shuffles all test cases, and the tests within each test case, -// making sure that death tests are still run first. -void UnitTestImpl::ShuffleTests() { - // Shuffles the death test cases. - ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); - - // Shuffles the non-death test cases. - ShuffleRange(random(), last_death_test_case_ + 1, - static_cast(test_cases_.size()), &test_case_indices_); - - // Shuffles the tests inside each test case. - for (size_t i = 0; i < test_cases_.size(); i++) { - test_cases_[i]->ShuffleTests(random()); - } -} - -// Restores the test cases and tests to their order before the first shuffle. -void UnitTestImpl::UnshuffleTests() { - for (size_t i = 0; i < test_cases_.size(); i++) { - // Unshuffles the tests in each test case. - test_cases_[i]->UnshuffleTests(); - // Resets the index of each test case. - test_case_indices_[i] = static_cast(i); - } -} - -// Returns the current OS stack trace as an std::string. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { - // We pass skip_count + 1 to skip this wrapper function in addition - // to what the user really wants to skip. - return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); -} - -// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to -// suppress unreachable code warnings. -namespace { -class ClassUniqueToAlwaysTrue {}; -} - -bool IsTrue(bool condition) { return condition; } - -bool AlwaysTrue() { -#if GTEST_HAS_EXCEPTIONS - // This condition is always false so AlwaysTrue() never actually throws, - // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); -#endif // GTEST_HAS_EXCEPTIONS - return true; -} - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -bool SkipPrefix(const char* prefix, const char** pstr) { - const size_t prefix_len = strlen(prefix); - if (strncmp(*pstr, prefix, prefix_len) == 0) { - *pstr += prefix_len; - return true; - } - return false; -} - -// Parses a string as a command line flag. The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { - // str and flag must not be NULL. - if (str == NULL || flag == NULL) return NULL; - - // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag; - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) { - return flag_end; - } - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return NULL; - - // Returns the string after "=". - return flag_end + 1; -} - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true as long as it does -// not start with '0', 'f', or 'F'. -// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Converts the string value to a bool. - *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); - return true; -} - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); -} - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseStringFlag(const char* str, const char* flag, std::string* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - *value = value_str; - return true; -} - -// Determines whether a string has a prefix that Google Test uses for its -// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. -// If Google Test detects that a command line flag has its prefix but is not -// recognized, it will print its help message. Flags starting with -// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test -// internal flags and do not trigger the help message. -static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || - SkipPrefix("/", &str)) && - !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && - (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || - SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); -} - -// Prints a string containing code-encoded text. The following escape -// sequences can be used in the string to control the text color: -// -// @@ prints a single '@' character. -// @R changes the color to red. -// @G changes the color to green. -// @Y changes the color to yellow. -// @D changes to the default terminal text color. -// -// TODO(wan@google.com): Write tests for this once we add stdout -// capturing to Google Test. -static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. - - // Conceptually, we split the string into segments divided by escape - // sequences. Then we print one segment at a time. At the end of - // each iteration, the str pointer advances to the beginning of the - // next segment. - for (;;) { - const char* p = strchr(str, '@'); - if (p == NULL) { - ColoredPrintf(color, "%s", str); - return; - } - - ColoredPrintf(color, "%s", std::string(str, p).c_str()); - - const char ch = p[1]; - str = p + 2; - if (ch == '@') { - ColoredPrintf(color, "@"); - } else if (ch == 'D') { - color = COLOR_DEFAULT; - } else if (ch == 'R') { - color = COLOR_RED; - } else if (ch == 'G') { - color = COLOR_GREEN; - } else if (ch == 'Y') { - color = COLOR_YELLOW; - } else { - --str; - } - } -} - -static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" - "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate an XML report in the given directory or with the given file\n" -" name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" -#if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" -#endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" -#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" -#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " - "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ - "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" -"(not one in your own code or tests), please report it to\n" -"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. The type parameter CharType can be -// instantiated to either char or wchar_t. -template -void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { - for (int i = 1; i < *argc; i++) { - const std::string arg_string = StreamableToString(argv[i]); - const char* const arg = arg_string.c_str(); - - using internal::ParseBoolFlag; - using internal::ParseInt32Flag; - using internal::ParseStringFlag; - - // Do we see a Google Test flag? - if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, - >EST_FLAG(also_run_disabled_tests)) || - ParseBoolFlag(arg, kBreakOnFailureFlag, - >EST_FLAG(break_on_failure)) || - ParseBoolFlag(arg, kCatchExceptionsFlag, - >EST_FLAG(catch_exceptions)) || - ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || - ParseStringFlag(arg, kDeathTestStyleFlag, - >EST_FLAG(death_test_style)) || - ParseBoolFlag(arg, kDeathTestUseFork, - >EST_FLAG(death_test_use_fork)) || - ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || - ParseStringFlag(arg, kInternalRunDeathTestFlag, - >EST_FLAG(internal_run_death_test)) || - ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || - ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || - ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || - ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || - ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || - ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || - ParseInt32Flag(arg, kStackTraceDepthFlag, - >EST_FLAG(stack_trace_depth)) || - ParseStringFlag(arg, kStreamResultToFlag, - >EST_FLAG(stream_result_to)) || - ParseBoolFlag(arg, kThrowOnFailureFlag, - >EST_FLAG(throw_on_failure)) - ) { - // Yes. Shift the remainder of the argv list left by one. Note - // that argv has (*argc + 1) elements, the last one always being - // NULL. The following loop moves the trailing NULL element as - // well. - for (int j = i; j != *argc; j++) { - argv[j] = argv[j + 1]; - } - - // Decrements the argument count. - (*argc)--; - - // We also need to decrement the iterator as we just removed - // an element. - i--; - } else if (arg_string == "--help" || arg_string == "-h" || - arg_string == "-?" || arg_string == "/?" || - HasGoogleTestFlagPrefix(arg)) { - // Both help flag and unrecognized Google Test flags (excluding - // internal ones) trigger help display. - g_help_flag = true; - } - } - - if (g_help_flag) { - // We print the help here instead of in RUN_ALL_TESTS(), as the - // latter may not be called at all if the user is using Google - // Test with another testing framework. - PrintColorEncoded(kColorEncodedHelpMessage); - } -} - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. -void ParseGoogleTestFlagsOnly(int* argc, char** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} -void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { - ParseGoogleTestFlagsOnlyImpl(argc, argv); -} - -// The internal implementation of InitGoogleTest(). -// -// The type parameter CharType can be instantiated to either char or -// wchar_t. -template -void InitGoogleTestImpl(int* argc, CharType** argv) { - g_init_gtest_count++; - - // We don't want to run the initialization code twice. - if (g_init_gtest_count != 1) return; - - if (*argc <= 0) return; - - internal::g_executable_path = internal::StreamableToString(argv[0]); - -#if GTEST_HAS_DEATH_TEST - - g_argvs.clear(); - for (int i = 0; i != *argc; i++) { - g_argvs.push_back(StreamableToString(argv[i])); - } - -#endif // GTEST_HAS_DEATH_TEST - - ParseGoogleTestFlagsOnly(argc, argv); - GetUnitTestImpl()->PostFlagParsingInit(); -} - -} // namespace internal - -// Initializes Google Test. This must be called before calling -// RUN_ALL_TESTS(). In particular, it parses a command line for the -// flags that Google Test recognizes. Whenever a Google Test flag is -// seen, it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Test flag variables are -// updated. -// -// Calling the function for the second time has no user-visible effect. -void InitGoogleTest(int* argc, char** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -void InitGoogleTest(int* argc, wchar_t** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -} // namespace testing -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) -// -// This file implements death tests. - - -#if GTEST_HAS_DEATH_TEST - -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include - -# if GTEST_OS_LINUX -# include -# endif // GTEST_OS_LINUX - -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -# if GTEST_OS_QNX -# include -# endif // GTEST_OS_QNX - -#endif // GTEST_HAS_DEATH_TEST - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -// Constants. - -// The default death test style. -static const char kDefaultDeathTestStyle[] = "fast"; - -GTEST_DEFINE_string_( - death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), - "Indicates how to run a death test in a forked child process: " - "\"threadsafe\" (child process re-executes the test binary " - "from the beginning, running only the specific death test) or " - "\"fast\" (child process runs the death test immediately " - "after forking)."); - -GTEST_DEFINE_bool_( - death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), - "Instructs to use fork()/_exit() instead of clone() in death tests. " - "Ignored and always uses fork() on POSIX systems where clone() is not " - "implemented. Useful when running under valgrind or similar tools if " - "those do not support clone(). Valgrind 3.3.1 will just fail if " - "it sees an unsupported combination of clone() flags. " - "It is not recommended to use this flag w/o valgrind though it will " - "work in 99% of the cases. Once valgrind is fixed, this flag will " - "most likely be removed."); - -namespace internal { -GTEST_DEFINE_string_( - internal_run_death_test, "", - "Indicates the file, line number, temporal index of " - "the single death test to run, and a file descriptor to " - "which a success code may be sent, all separated by " - "the '|' characters. This flag is specified if and only if the current " - "process is a sub-process launched for running a thread-safe " - "death test. FOR INTERNAL USE ONLY."); -} // namespace internal - -#if GTEST_HAS_DEATH_TEST - -namespace internal { - -# if !GTEST_OS_WINDOWS -// Valid only for fast death tests. Indicates the code is running in the -// child process of a fast style death test. -static bool g_in_fast_death_test_child = false; -# endif // !GTEST_OS_WINDOWS - -// Returns a Boolean value indicating whether the caller is currently -// executing in the context of the death test child process. Tools such as -// Valgrind heap checkers may need this to modify their behavior in death -// tests. IMPORTANT: This is an internal utility. Using it may break the -// implementation of death tests. User code MUST NOT use it. -bool InDeathTestChild() { -# if GTEST_OS_WINDOWS - - // On Windows, death tests are thread-safe regardless of the value of the - // death_test_style flag. - return !GTEST_FLAG(internal_run_death_test).empty(); - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") - return !GTEST_FLAG(internal_run_death_test).empty(); - else - return g_in_fast_death_test_child; -#endif -} - -} // namespace internal - -// ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} - -// ExitedWithCode function-call operator. -bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS - - return exit_status == exit_code_; - -# else - - return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; - -# endif // GTEST_OS_WINDOWS -} - -# if !GTEST_OS_WINDOWS -// KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} - -// KilledBySignal function-call operator. -bool KilledBySignal::operator()(int exit_status) const { - return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; -} -# endif // !GTEST_OS_WINDOWS - -namespace internal { - -// Utilities needed for death tests. - -// Generates a textual description of a given exit code, in the format -// specified by wait(2). -static std::string ExitSummary(int exit_code) { - Message m; - -# if GTEST_OS_WINDOWS - - m << "Exited with exit status " << exit_code; - -# else - - if (WIFEXITED(exit_code)) { - m << "Exited with exit status " << WEXITSTATUS(exit_code); - } else if (WIFSIGNALED(exit_code)) { - m << "Terminated by signal " << WTERMSIG(exit_code); - } -# ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - m << " (core dumped)"; - } -# endif -# endif // GTEST_OS_WINDOWS - - return m.GetString(); -} - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -bool ExitedUnsuccessfully(int exit_status) { - return !ExitedWithCode(0)(exit_status); -} - -# if !GTEST_OS_WINDOWS -// Generates a textual failure message when a death test finds more than -// one thread running, or cannot determine the number of threads, prior -// to executing the given statement. It is the responsibility of the -// caller not to pass a thread_count of 1. -static std::string DeathTestThreadWarning(size_t thread_count) { - Message msg; - msg << "Death tests use fork(), which is unsafe particularly" - << " in a threaded context. For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) - msg << "couldn't detect the number of threads."; - else - msg << "detected " << thread_count << " threads."; - return msg.GetString(); -} -# endif // !GTEST_OS_WINDOWS - -// Flag characters for reporting a death test that did not die. -static const char kDeathTestLived = 'L'; -static const char kDeathTestReturned = 'R'; -static const char kDeathTestThrew = 'T'; -static const char kDeathTestInternalError = 'I'; - -// An enumeration describing all of the possible ways that a death test can -// conclude. DIED means that the process died while executing the test -// code; LIVED means that process lived beyond the end of the test code; -// RETURNED means that the test statement attempted to execute a return -// statement, which is not allowed; THREW means that the test statement -// returned control by throwing an exception. IN_PROGRESS means the test -// has not yet concluded. -// TODO(vladl@google.com): Unify names and possibly values for -// AbortReason, DeathTestOutcome, and flag characters above. -enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; - -// Routine for aborting the program which is safe to call from an -// exec-style death test child process, in which case the error -// message is propagated back to the parent process. Otherwise, the -// message is simply printed to stderr. In either case, the program -// then exits with status 1. -void DeathTestAbort(const std::string& message) { - // On a POSIX system, this function may be called from a threadsafe-style - // death test child process, which operates on a very small stack. Use - // the heap for any additional non-minuscule memory requirements. - const InternalRunDeathTestFlag* const flag = - GetUnitTestImpl()->internal_run_death_test_flag(); - if (flag != NULL) { - FILE* parent = posix::FDOpen(flag->write_fd(), "w"); - fputc(kDeathTestInternalError, parent); - fprintf(parent, "%s", message.c_str()); - fflush(parent); - _exit(1); - } else { - fprintf(stderr, "%s", message.c_str()); - fflush(stderr); - posix::Abort(); - } -} - -// A replacement for CHECK that calls DeathTestAbort if the assertion -// fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for -// evaluating any system call that fulfills two conditions: it must return -// -1 on failure, and set errno to EINTR when it is interrupted and -// should be tried again. The macro expands to a loop that repeatedly -// evaluates the expression as long as it evaluates to -1 and sets -// errno to EINTR. If the expression evaluates to -1 but errno is -// something other than EINTR, DeathTestAbort is called. -# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort( \ - ::std::string("CHECK failed: File ") + __FILE__ + ", line " \ - + ::testing::internal::StreamableToString(__LINE__) + ": " \ - + #expression + " != -1"); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// Returns the message describing the last system error in errno. -std::string GetLastErrnoDescription() { - return errno == 0 ? "" : posix::StrError(errno); -} - -// This is called from a death test parent process to read a failure -// message from the death test child process and log it with the FATAL -// severity. On Windows, the message is read from a pipe handle. On other -// platforms, it is read from a file descriptor. -static void FailFromInternalError(int fd) { - Message error; - char buffer[256]; - int num_read; - - do { - while ((num_read = posix::Read(fd, buffer, 255)) > 0) { - buffer[num_read] = '\0'; - error << buffer; - } - } while (num_read == -1 && errno == EINTR); - - if (num_read == 0) { - GTEST_LOG_(FATAL) << error.GetString(); - } else { - const int last_error = errno; - GTEST_LOG_(FATAL) << "Error while reading death test internal: " - << GetLastErrnoDescription() << " [" << last_error << "]"; - } -} - -// Death test constructor. Increments the running death test count -// for the current test. -DeathTest::DeathTest() { - TestInfo* const info = GetUnitTestImpl()->current_test_info(); - if (info == NULL) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); - } -} - -// Creates and returns a death test by dispatching to the current -// death test factory. -bool DeathTest::Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) { - return GetUnitTestImpl()->death_test_factory()->Create( - statement, regex, file, line, test); -} - -const char* DeathTest::LastMessage() { - return last_death_test_message_.c_str(); -} - -void DeathTest::set_last_death_test_message(const std::string& message) { - last_death_test_message_ = message; -} - -std::string DeathTest::last_death_test_message_; - -// Provides cross platform implementation for some death functionality. -class DeathTestImpl : public DeathTest { - protected: - DeathTestImpl(const char* a_statement, const RE* a_regex) - : statement_(a_statement), - regex_(a_regex), - spawned_(false), - status_(-1), - outcome_(IN_PROGRESS), - read_fd_(-1), - write_fd_(-1) {} - - // read_fd_ is expected to be closed and cleared by a derived class. - ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } - - void Abort(AbortReason reason); - virtual bool Passed(bool status_ok); - - const char* statement() const { return statement_; } - const RE* regex() const { return regex_; } - bool spawned() const { return spawned_; } - void set_spawned(bool is_spawned) { spawned_ = is_spawned; } - int status() const { return status_; } - void set_status(int a_status) { status_ = a_status; } - DeathTestOutcome outcome() const { return outcome_; } - void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } - int read_fd() const { return read_fd_; } - void set_read_fd(int fd) { read_fd_ = fd; } - int write_fd() const { return write_fd_; } - void set_write_fd(int fd) { write_fd_ = fd; } - - // Called in the parent process only. Reads the result code of the death - // test child process via a pipe, interprets it to set the outcome_ - // member, and closes read_fd_. Outputs diagnostics and terminates in - // case of unexpected codes. - void ReadAndInterpretStatusByte(); - - private: - // The textual content of the code this object is testing. This class - // doesn't own this string and should not attempt to delete it. - const char* const statement_; - // The regular expression which test output must match. DeathTestImpl - // doesn't own this object and should not attempt to delete it. - const RE* const regex_; - // True if the death test child process has been successfully spawned. - bool spawned_; - // The exit status of the child process. - int status_; - // How the death test concluded. - DeathTestOutcome outcome_; - // Descriptor to the read end of the pipe to the child process. It is - // always -1 in the child process. The child keeps its write end of the - // pipe in write_fd_. - int read_fd_; - // Descriptor to the child's write end of the pipe to the parent process. - // It is always -1 in the parent process. The parent keeps its end of the - // pipe in read_fd_. - int write_fd_; -}; - -// Called in the parent process only. Reads the result code of the death -// test child process via a pipe, interprets it to set the outcome_ -// member, and closes read_fd_. Outputs diagnostics and terminates in -// case of unexpected codes. -void DeathTestImpl::ReadAndInterpretStatusByte() { - char flag; - int bytes_read; - - // The read() here blocks until data is available (signifying the - // failure of the death test) or until the pipe is closed (signifying - // its success), so it's okay to call this in the parent before - // the child process has exited. - do { - bytes_read = posix::Read(read_fd(), &flag, 1); - } while (bytes_read == -1 && errno == EINTR); - - if (bytes_read == 0) { - set_outcome(DIED); - } else if (bytes_read == 1) { - switch (flag) { - case kDeathTestReturned: - set_outcome(RETURNED); - break; - case kDeathTestThrew: - set_outcome(THREW); - break; - case kDeathTestLived: - set_outcome(LIVED); - break; - case kDeathTestInternalError: - FailFromInternalError(read_fd()); // Does not return. - break; - default: - GTEST_LOG_(FATAL) << "Death test child process reported " - << "unexpected status byte (" - << static_cast(flag) << ")"; - } - } else { - GTEST_LOG_(FATAL) << "Read from death test child process failed: " - << GetLastErrnoDescription(); - } - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); - set_read_fd(-1); -} - -// Signals that the death test code which should have exited, didn't. -// Should be called only in a death test child process. -// Writes a status byte to the child's status file descriptor, then -// calls _exit(1). -void DeathTestImpl::Abort(AbortReason reason) { - // The parent process considers the death test to be a failure if - // it finds any data in our pipe. So, here we write a single flag byte - // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; - - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); - // We are leaking the descriptor here because on some platforms (i.e., - // when built as Windows DLL), destructors of global objects will still - // run after calling _exit(). On such systems, write_fd_ will be - // indirectly closed from the destructor of UnitTestImpl, causing double - // close if it is also closed here. On debug configurations, double close - // may assert. As there are no in-process buffers to flush here, we are - // relying on the OS to close the descriptor after the process terminates - // when the destructors are not run. - _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) -} - -// Returns an indented copy of stderr output for a death test. -// This makes distinguishing death test output lines from regular log lines -// much easier. -static ::std::string FormatDeathTestOutput(const ::std::string& output) { - ::std::string ret; - for (size_t at = 0; ; ) { - const size_t line_end = output.find('\n', at); - ret += "[ DEATH ] "; - if (line_end == ::std::string::npos) { - ret += output.substr(at); - break; - } - ret += output.substr(at, line_end + 1 - at); - at = line_end + 1; - } - return ret; -} - -// Assesses the success or failure of a death test, using both private -// members which have previously been set, and one argument: -// -// Private data members: -// outcome: An enumeration describing how the death test -// concluded: DIED, LIVED, THREW, or RETURNED. The death test -// fails in the latter three cases. -// status: The exit status of the child process. On *nix, it is in the -// in the format specified by wait(2). On Windows, this is the -// value supplied to the ExitProcess() API or a numeric code -// of the exception that terminated the program. -// regex: A regular expression object to be applied to -// the test's captured standard error output; the death test -// fails if it does not match. -// -// Argument: -// status_ok: true if exit_status is acceptable in the context of -// this particular death test, which fails if it is false -// -// Returns true iff all of the above conditions are met. Otherwise, the -// first failing condition, in the order given above, is the one that is -// reported. Also sets the last death test message string. -bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; - - const std::string error_message = GetCapturedStderr(); - - bool success = false; - Message buffer; - - buffer << "Death test: " << statement() << "\n"; - switch (outcome()) { - case LIVED: - buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case THREW: - buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case RETURNED: - buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case DIED: - if (status_ok) { - const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); - if (matched) { - success = true; - } else { - buffer << " Result: died but not with expected error.\n" - << " Expected: " << regex()->pattern() << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - } else { - buffer << " Result: died but not with expected exit code:\n" - << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - break; - case IN_PROGRESS: - default: - GTEST_LOG_(FATAL) - << "DeathTest::Passed somehow called before conclusion of test"; - } - - DeathTest::set_last_death_test_message(buffer.GetString()); - return success; -} - -# if GTEST_OS_WINDOWS -// WindowsDeathTest implements death tests on Windows. Due to the -// specifics of starting new processes on Windows, death tests there are -// always threadsafe, and Google Test considers the -// --gtest_death_test_style=fast setting to be equivalent to -// --gtest_death_test_style=threadsafe there. -// -// A few implementation notes: Like the Linux version, the Windows -// implementation uses pipes for child-to-parent communication. But due to -// the specifics of pipes on Windows, some extra steps are required: -// -// 1. The parent creates a communication pipe and stores handles to both -// ends of it. -// 2. The parent starts the child and provides it with the information -// necessary to acquire the handle to the write end of the pipe. -// 3. The child acquires the write end of the pipe and signals the parent -// using a Windows event. -// 4. Now the parent can release the write end of the pipe on its side. If -// this is done before step 3, the object's reference count goes down to -// 0 and it is destroyed, preventing the child from acquiring it. The -// parent now has to release it, or read operations on the read end of -// the pipe will not return when the child terminates. -// 5. The parent reads child's output through the pipe (outcome code and -// any possible error messages) from the pipe, and its stderr and then -// determines whether to fail the test. -// -// Note: to distinguish Win32 API calls from the local method and function -// calls, the former are explicitly resolved in the global namespace. -// -class WindowsDeathTest : public DeathTestImpl { - public: - WindowsDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - virtual TestRole AssumeRole(); - - private: - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; - // Handle to the write end of the pipe to the child process. - AutoHandle write_handle_; - // Child process handle. - AutoHandle child_handle_; - // Event the child process uses to signal the parent that it has - // acquired the handle to the write end of the pipe. After seeing this - // event the parent can release its own handles to make sure its - // ReadFile() calls return when the child terminates. - AutoHandle event_handle_; -}; - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; - - // Wait until the child either signals that it has acquired the write end - // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, - FALSE, // Waits for any of the handles. - INFINITE)) { - case WAIT_OBJECT_0: - case WAIT_OBJECT_0 + 1: - break; - default: - GTEST_DEATH_TEST_CHECK_(false); // Should not get here. - } - - // The child has acquired the write end of the pipe or exited. - // We release the handle on our side and continue. - write_handle_.Reset(); - event_handle_.Reset(); - - ReadAndInterpretStatusByte(); - - // Waits for the child process to exit if it haven't already. This - // returns immediately if the child has already exited, regardless of - // whether previous calls to WaitForMultipleObjects synchronized on this - // handle or not. - GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); - DWORD status_code; - GTEST_DEATH_TEST_CHECK_( - ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); - child_handle_.Reset(); - set_status(static_cast(status_code)); - return status(); -} - -// The AssumeRole process for a Windows death test. It creates a child -// process with the same executable as the current process to run the -// death test. The child process is given the --gtest_filter and -// --gtest_internal_run_death_test flags such that it knows to run the -// current death test only. -DeathTest::TestRole WindowsDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - // ParseInternalRunDeathTestFlag() has performed all the necessary - // processing. - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - // WindowsDeathTest uses an anonymous pipe to communicate results of - // a death test. - SECURITY_ATTRIBUTES handles_are_inheritable = { - sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; - HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. - != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); - write_handle_.Reset(write_handle); - event_handle_.Reset(::CreateEvent( - &handles_are_inheritable, - TRUE, // The event will automatically reset to non-signaled state. - FALSE, // The initial state is non-signalled. - NULL)); // The even is unnamed. - GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + - info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + - "=" + file_ + "|" + StreamableToString(line_) + "|" + - StreamableToString(death_test_index) + "|" + - StreamableToString(static_cast(::GetCurrentProcessId())) + - // size_t has the same width as pointers on both 32-bit and 64-bit - // Windows platforms. - // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - "|" + StreamableToString(reinterpret_cast(write_handle)) + - "|" + StreamableToString(reinterpret_cast(event_handle_.Get())); - - char executable_path[_MAX_PATH + 1]; // NOLINT - GTEST_DEATH_TEST_CHECK_( - _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, - executable_path, - _MAX_PATH)); - - std::string command_line = - std::string(::GetCommandLineA()) + " " + filter_flag + " \"" + - internal_flag + "\""; - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // Flush the log buffers since the log streams are shared with the child. - FlushInfoLog(); - - // The child process will share the standard handles with the parent. - STARTUPINFOA startup_info; - memset(&startup_info, 0, sizeof(STARTUPINFO)); - startup_info.dwFlags = STARTF_USESTDHANDLES; - startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); - startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); - startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); - - PROCESS_INFORMATION process_info; - GTEST_DEATH_TEST_CHECK_(::CreateProcessA( - executable_path, - const_cast(command_line.c_str()), - NULL, // Retuned process handle is not inheritable. - NULL, // Retuned thread handle is not inheritable. - TRUE, // Child inherits all inheritable handles (for write_handle_). - 0x0, // Default creation flags. - NULL, // Inherit the parent's environment. - UnitTest::GetInstance()->original_working_dir(), - &startup_info, - &process_info) != FALSE); - child_handle_.Reset(process_info.hProcess); - ::CloseHandle(process_info.hThread); - set_spawned(true); - return OVERSEE_TEST; -} -# else // We are not on Windows. - -// ForkingDeathTest provides implementations for most of the abstract -// methods of the DeathTest interface. Only the AssumeRole method is -// left undefined. -class ForkingDeathTest : public DeathTestImpl { - public: - ForkingDeathTest(const char* statement, const RE* regex); - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - - protected: - void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } - - private: - // PID of child process during death test; 0 in the child process itself. - pid_t child_pid_; -}; - -// Constructs a ForkingDeathTest. -ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) - : DeathTestImpl(a_statement, a_regex), - child_pid_(-1) {} - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; - - ReadAndInterpretStatusByte(); - - int status_value; - GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); - set_status(status_value); - return status_value; -} - -// A concrete death test class that forks, then immediately runs the test -// in the child process. -class NoExecDeathTest : public ForkingDeathTest { - public: - NoExecDeathTest(const char* a_statement, const RE* a_regex) : - ForkingDeathTest(a_statement, a_regex) { } - virtual TestRole AssumeRole(); -}; - -// The AssumeRole process for a fork-and-run death test. It implements a -// straightforward fork, with a simple pipe to transmit the status byte. -DeathTest::TestRole NoExecDeathTest::AssumeRole() { - const size_t thread_count = GetThreadCount(); - if (thread_count != 1) { - GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - - DeathTest::set_last_death_test_message(""); - CaptureStderr(); - // When we fork the process below, the log file buffers are copied, but the - // file descriptors are shared. We flush all log files here so that closing - // the file descriptors in the child process doesn't throw off the - // synchronization between descriptors and buffers in the parent process. - // This is as close to the fork as possible to avoid a race condition in case - // there are multiple threads running before the death test, and another - // thread writes to the log file. - FlushInfoLog(); - - const pid_t child_pid = fork(); - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - set_child_pid(child_pid); - if (child_pid == 0) { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); - set_write_fd(pipe_fd[1]); - // Redirects all logging to stderr in the child process to prevent - // concurrent writes to the log files. We capture stderr in the parent - // process and append the child process' output to a log. - LogToStderr(); - // Event forwarding to the listeners of event listener API mush be shut - // down in death test subprocesses. - GetUnitTestImpl()->listeners()->SuppressEventForwarding(); - g_in_fast_death_test_child = true; - return EXECUTE_TEST; - } else { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; - } -} - -// A concrete death test class that forks and re-executes the main -// program from the beginning, with command-line flags set that cause -// only this specific death test to be run. -class ExecDeathTest : public ForkingDeathTest { - public: - ExecDeathTest(const char* a_statement, const RE* a_regex, - const char* file, int line) : - ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } - virtual TestRole AssumeRole(); - private: - static ::std::vector - GetArgvsForDeathTestChildProcess() { - ::std::vector args = GetInjectableArgvs(); - return args; - } - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; -}; - -// Utility class for accumulating command-line arguments. -class Arguments { - public: - Arguments() { - args_.push_back(NULL); - } - - ~Arguments() { - for (std::vector::iterator i = args_.begin(); i != args_.end(); - ++i) { - free(*i); - } - } - void AddArgument(const char* argument) { - args_.insert(args_.end() - 1, posix::StrDup(argument)); - } - - template - void AddArguments(const ::std::vector& arguments) { - for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { - args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); - } - } - char* const* Argv() { - return &args_[0]; - } - - private: - std::vector args_; -}; - -// A struct that encompasses the arguments to the child process of a -// threadsafe-style death test process. -struct ExecDeathTestArgs { - char* const* argv; // Command-line arguments for the child's call to exec - int close_fd; // File descriptor to close; the read end of a pipe -}; - -# if GTEST_OS_MAC -inline char** GetEnviron() { - // When Google Test is built as a framework on MacOS X, the environ variable - // is unavailable. Apple's documentation (man environ) recommends using - // _NSGetEnviron() instead. - return *_NSGetEnviron(); -} -# else -// Some POSIX platforms expect you to declare environ. extern "C" makes -// it reside in the global namespace. -extern "C" char** environ; -inline char** GetEnviron() { return environ; } -# endif // GTEST_OS_MAC - -# if !GTEST_OS_QNX -// The main function for a threadsafe-style death test child process. -// This function is called in a clone()-ed process and thus must avoid -// any potentially unsafe operations like malloc or libc functions. -static int ExecDeathTestChildMain(void* child_arg) { - ExecDeathTestArgs* const args = static_cast(child_arg); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); - - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - // We can safely call execve() as it's a direct system call. We - // cannot use execvp() as it's a libc function and thus potentially - // unsafe. Since execve() doesn't search the PATH, the user must - // invoke the test program via a valid path that contains at least - // one path separator. - execve(args->argv[0], args->argv, GetEnviron()); - DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " + - original_dir + " failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; -} -# endif // !GTEST_OS_QNX - -// Two utility routines that together determine the direction the stack -// grows. -// This could be accomplished more elegantly by a single recursive -// function, but we want to guard against the unlikely possibility of -// a smart compiler optimizing the recursion away. -// -// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining -// StackLowerThanAddress into StackGrowsDown, which then doesn't give -// correct answer. -void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; -void StackLowerThanAddress(const void* ptr, bool* result) { - int dummy; - *result = (&dummy < ptr); -} - -bool StackGrowsDown() { - int dummy; - bool result; - StackLowerThanAddress(&dummy, &result); - return result; -} - -// Spawns a child process with the same executable as the current process in -// a thread-safe manner and instructs it to run the death test. The -// implementation uses fork(2) + exec. On systems where clone(2) is -// available, it is used instead, being slightly more thread-safe. On QNX, -// fork supports only single-threaded environments, so this function uses -// spawn(2) there instead. The function dies with an error message if -// anything goes wrong. -static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; - pid_t child_pid = -1; - -# if GTEST_OS_QNX - // Obtains the current directory and sets it to be closed in the child - // process. - const int cwd_fd = open(".", O_RDONLY); - GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " + - GetLastErrnoDescription()); - return EXIT_FAILURE; - } - - int fd_flags; - // Set close_fd to be closed after spawn. - GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); - GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD, - fd_flags | FD_CLOEXEC)); - struct inheritance inherit = {0}; - // spawn is a system call. - child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron()); - // Restores the current working directory. - GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); - -# else // GTEST_OS_QNX -# if GTEST_OS_LINUX - // When a SIGPROF signal is received while fork() or clone() are executing, - // the process may hang. To avoid this, we ignore SIGPROF here and re-enable - // it after the call to fork()/clone() is complete. - struct sigaction saved_sigprof_action; - struct sigaction ignore_sigprof_action; - memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); - sigemptyset(&ignore_sigprof_action.sa_mask); - ignore_sigprof_action.sa_handler = SIG_IGN; - GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction( - SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); -# endif // GTEST_OS_LINUX - -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); - - if (!use_fork) { - static const bool stack_grows_down = StackGrowsDown(); - const size_t stack_size = getpagesize(); - // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. - void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); - - // Maximum stack alignment in bytes: For a downward-growing stack, this - // amount is subtracted from size of the stack space to get an address - // that is within the stack space and is aligned on all systems we care - // about. As far as I know there is no ABI with stack alignment greater - // than 64. We assume stack and stack_size already have alignment of - // kMaxStackAlignment. - const size_t kMaxStackAlignment = 64; - void* const stack_top = - static_cast(stack) + - (stack_grows_down ? stack_size - kMaxStackAlignment : 0); - GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment && - reinterpret_cast(stack_top) % kMaxStackAlignment == 0); - - child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); - - GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); - } -# else - const bool use_fork = true; -# endif // GTEST_HAS_CLONE - - if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); - } -# endif // GTEST_OS_QNX -# if GTEST_OS_LINUX - GTEST_DEATH_TEST_CHECK_SYSCALL_( - sigaction(SIGPROF, &saved_sigprof_action, NULL)); -# endif // GTEST_OS_LINUX - - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - return child_pid; -} - -// The AssumeRole process for a fork-and-exec death test. It re-executes the -// main program from the beginning, setting the --gtest_filter -// and --gtest_internal_run_death_test flags to cause only the current -// death test to be re-run. -DeathTest::TestRole ExecDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - // Clear the close-on-exec flag on the write end of the pipe, lest - // it be closed when the child process does an exec: - GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); - - const std::string filter_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" - + info->test_case_name() + "." + info->name(); - const std::string internal_flag = - std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" - + file_ + "|" + StreamableToString(line_) + "|" - + StreamableToString(death_test_index) + "|" - + StreamableToString(pipe_fd[1]); - Arguments args; - args.AddArguments(GetArgvsForDeathTestChildProcess()); - args.AddArgument(filter_flag.c_str()); - args.AddArgument(internal_flag.c_str()); - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // See the comment in NoExecDeathTest::AssumeRole for why the next line - // is necessary. - FlushInfoLog(); - - const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_child_pid(child_pid); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; -} - -# endif // !GTEST_OS_WINDOWS - -// Creates a concrete DeathTest-derived class that depends on the -// --gtest_death_test_style flag, and sets the pointer pointed to -// by the "test" argument to its address. If the test should be -// skipped, sets that pointer to NULL. Returns true, unless the -// flag is set to an invalid value. -bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, - const char* file, int line, - DeathTest** test) { - UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); - - if (flag != NULL) { - if (death_test_index > flag->index()) { - DeathTest::set_last_death_test_message( - "Death test count (" + StreamableToString(death_test_index) - + ") somehow exceeded expected maximum (" - + StreamableToString(flag->index()) + ")"); - return false; - } - - if (!(flag->file() == file && flag->line() == line && - flag->index() == death_test_index)) { - *test = NULL; - return true; - } - } - -# if GTEST_OS_WINDOWS - - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { - *test = new WindowsDeathTest(statement, regex, file, line); - } - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") { - *test = new ExecDeathTest(statement, regex, file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { - *test = new NoExecDeathTest(statement, regex); - } - -# endif // GTEST_OS_WINDOWS - - else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message( - "Unknown death test style \"" + GTEST_FLAG(death_test_style) - + "\" encountered"); - return false; - } - - return true; -} - -// Splits a given string on a given delimiter, populating a given -// vector with the fields. GTEST_HAS_DEATH_TEST implies that we have -// ::std::string, so we can use it here. -static void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest) { - ::std::vector< ::std::string> parsed; - ::std::string::size_type pos = 0; - while (::testing::internal::AlwaysTrue()) { - const ::std::string::size_type colon = str.find(delimiter, pos); - if (colon == ::std::string::npos) { - parsed.push_back(str.substr(pos)); - break; - } else { - parsed.push_back(str.substr(pos, colon - pos)); - pos = colon + 1; - } - } - dest->swap(parsed); -} - -# if GTEST_OS_WINDOWS -// Recreates the pipe and event handles from the provided parameters, -// signals the event, and returns a file descriptor wrapped around the pipe -// handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { - AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); - if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { - DeathTestAbort("Unable to open parent process " + - StreamableToString(parent_process_id)); - } - - // TODO(vladl@google.com): Replace the following check with a - // compile-time assertion when available. - GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); - HANDLE dup_write_handle; - - // The newly initialized handle is accessible only in in the parent - // process. To obtain one accessible within the child, we need to use - // DuplicateHandle. - if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, - ::GetCurrentProcess(), &dup_write_handle, - 0x0, // Requested privileges ignored since - // DUPLICATE_SAME_ACCESS is used. - FALSE, // Request non-inheritable handler. - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the pipe handle " + - StreamableToString(write_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); - HANDLE dup_event_handle; - - if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort("Unable to duplicate the event handle " + - StreamableToString(event_handle_as_size_t) + - " from the parent process " + - StreamableToString(parent_process_id)); - } - - const int write_fd = - ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); - if (write_fd == -1) { - DeathTestAbort("Unable to convert pipe handle " + - StreamableToString(write_handle_as_size_t) + - " to a file descriptor"); - } - - // Signals the parent that the write end of the pipe has been acquired - // so the parent can release its own write end. - ::SetEvent(dup_event_handle); - - return write_fd; -} -# endif // GTEST_OS_WINDOWS - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return NULL; - - // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we - // can use it here. - int line = -1; - int index = -1; - ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); - int write_fd = -1; - -# if GTEST_OS_WINDOWS - - unsigned int parent_process_id = 0; - size_t write_handle_as_size_t = 0; - size_t event_handle_as_size_t = 0; - - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + - GTEST_FLAG(internal_run_death_test)); - } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, - event_handle_as_size_t); -# else - - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort("Bad --gtest_internal_run_death_test flag: " - + GTEST_FLAG(internal_run_death_test)); - } - -# endif // GTEST_OS_WINDOWS - - return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); -} - -} // namespace internal - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) - - -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include -#elif GTEST_OS_WINDOWS -# include -# include -#elif GTEST_OS_SYMBIAN -// Symbian OpenC has PATH_MAX in sys/syslimits.h -# include -#else -# include -# include // Some Linux distributions define PATH_MAX here. -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH -#elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX -#elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX -#else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX -#endif // GTEST_OS_WINDOWS - - -namespace testing { -namespace internal { - -#if GTEST_OS_WINDOWS -// On Windows, '\\' is the standard path separator, but many tools and the -// Windows API also accept '/' as an alternate path separator. Unless otherwise -// noted, a file path can contain either kind of path separators, or a mixture -// of them. -const char kPathSeparator = '\\'; -const char kAlternatePathSeparator = '/'; -const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE -// Windows CE doesn't have a current directory. You should not use -// the current directory in tests on Windows CE, but this at least -// provides a reasonable fallback. -const char kCurrentDirectoryString[] = "\\"; -// Windows CE doesn't define INVALID_FILE_ATTRIBUTES -const DWORD kInvalidFileAttributes = 0xffffffff; -# else -const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE -#else -const char kPathSeparator = '/'; -const char kCurrentDirectoryString[] = "./"; -#endif // GTEST_OS_WINDOWS - -// Returns whether the given character is a valid path separator. -static bool IsPathSeparator(char c) { -#if GTEST_HAS_ALT_PATH_SEP_ - return (c == kPathSeparator) || (c == kAlternatePathSeparator); -#else - return c == kPathSeparator; -#endif -} - -// Returns the current working directory, or "" if unsuccessful. -FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE doesn't have a current directory, so we just return - // something reasonable. - return FilePath(kCurrentDirectoryString); -#elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns a copy of the FilePath with the case-insensitive extension removed. -// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns -// FilePath("dir/file"). If a case-insensitive extension is not -// found, returns a copy of the original FilePath. -FilePath FilePath::RemoveExtension(const char* extension) const { - const std::string dot_extension = std::string(".") + extension; - if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { - return FilePath(pathname_.substr( - 0, pathname_.length() - dot_extension.length())); - } - return *this; -} - -// Returns a pointer to the last occurence of a valid path separator in -// the FilePath. On Windows, for example, both '/' and '\' are valid path -// separators. Returns NULL if no path separator was found. -const char* FilePath::FindLastPathSeparator() const { - const char* const last_sep = strrchr(c_str(), kPathSeparator); -#if GTEST_HAS_ALT_PATH_SEP_ - const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); - // Comparing two pointers of which only one is NULL is undefined. - if (last_alt_sep != NULL && - (last_sep == NULL || last_alt_sep > last_sep)) { - return last_alt_sep; - } -#endif - return last_sep; -} - -// Returns a copy of the FilePath with the directory part removed. -// Example: FilePath("path/to/file").RemoveDirectoryName() returns -// FilePath("file"). If there is no directory part ("just_a_file"), it returns -// the FilePath unmodified. If there is no file part ("just_a_dir/") it -// returns an empty FilePath (""). -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveDirectoryName() const { - const char* const last_sep = FindLastPathSeparator(); - return last_sep ? FilePath(last_sep + 1) : *this; -} - -// RemoveFileName returns the directory path with the filename removed. -// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". -// If the FilePath is "a_file" or "/a_file", RemoveFileName returns -// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does -// not have a file, like "just/a/dir/", it returns the FilePath unmodified. -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveFileName() const { - const char* const last_sep = FindLastPathSeparator(); - std::string dir; - if (last_sep) { - dir = std::string(c_str(), last_sep + 1 - c_str()); - } else { - dir = kCurrentDirectoryString; - } - return FilePath(dir); -} - -// Helper functions for naming files in a directory for xml output. - -// Given directory = "dir", base_name = "test", number = 0, -// extension = "xml", returns "dir/test.xml". If number is greater -// than zero (e.g., 12), returns "dir/test_12.xml". -// On Windows platform, uses \ as the separator rather than /. -FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension) { - std::string file; - if (number == 0) { - file = base_name.string() + "." + extension; - } else { - file = base_name.string() + "_" + StreamableToString(number) - + "." + extension; - } - return ConcatPaths(directory, FilePath(file)); -} - -// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". -// On Windows, uses \ as the separator rather than /. -FilePath FilePath::ConcatPaths(const FilePath& directory, - const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; - const FilePath dir(directory.RemoveTrailingPathSeparator()); - return FilePath(dir.string() + kPathSeparator + relative_path.string()); -} - -// Returns true if pathname describes something findable in the file-system, -// either a file, directory, or whatever. -bool FilePath::FileOrDirectoryExists() const { -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - return attributes != kInvalidFileAttributes; -#else - posix::StatStruct file_stat; - return posix::Stat(pathname_.c_str(), &file_stat) == 0; -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns true if pathname describes a directory in the file-system -// that exists. -bool FilePath::DirectoryExists() const { - bool result = false; -#if GTEST_OS_WINDOWS - // Don't strip off trailing separator if path is a root directory on - // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? *this : - RemoveTrailingPathSeparator()); -#else - const FilePath& path(*this); -#endif - -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - if ((attributes != kInvalidFileAttributes) && - (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - result = true; - } -#else - posix::StatStruct file_stat; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); -#endif // GTEST_OS_WINDOWS_MOBILE - - return result; -} - -// Returns true if pathname describes a root directory. (Windows has one -// root directory per disk drive.) -bool FilePath::IsRootDirectory() const { -#if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like - // \\server\share can be a root directory, although it cannot be the - // current directory. Handle this properly. - return pathname_.length() == 3 && IsAbsolutePath(); -#else - return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); -#endif -} - -// Returns true if pathname describes an absolute path. -bool FilePath::IsAbsolutePath() const { - const char* const name = pathname_.c_str(); -#if GTEST_OS_WINDOWS - return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); -#else - return IsPathSeparator(name[0]); -#endif -} - -// Returns a pathname for a file that does not currently exist. The pathname -// will be directory/base_name.extension or -// directory/base_name_.extension if directory/base_name.extension -// already exists. The number will be incremented until a pathname is found -// that does not already exist. -// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. -// There could be a race condition if two or more processes are calling this -// function at the same time -- they could both pick the same filename. -FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension) { - FilePath full_pathname; - int number = 0; - do { - full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); - } while (full_pathname.FileOrDirectoryExists()); - return full_pathname; -} - -// Returns true if FilePath ends with a path separator, which indicates that -// it is intended to represent a directory. Returns false otherwise. -// This does NOT check that a directory (or file) actually exists. -bool FilePath::IsDirectory() const { - return !pathname_.empty() && - IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); -} - -// Create directories so that path exists. Returns true if successful or if -// the directories already exist; returns false if unable to create directories -// for any reason. -bool FilePath::CreateDirectoriesRecursively() const { - if (!this->IsDirectory()) { - return false; - } - - if (pathname_.length() == 0 || this->DirectoryExists()) { - return true; - } - - const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); - return parent.CreateDirectoriesRecursively() && this->CreateFolder(); -} - -// Create the directory so that path exists. Returns true if successful or -// if the directory already exists; returns false if unable to create the -// directory for any reason, including if the parent directory does not -// exist. Not named "CreateDirectory" because that's a macro on Windows. -bool FilePath::CreateFolder() const { -#if GTEST_OS_WINDOWS_MOBILE - FilePath removed_sep(this->RemoveTrailingPathSeparator()); - LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); - int result = CreateDirectory(unicode, NULL) ? 0 : -1; - delete [] unicode; -#elif GTEST_OS_WINDOWS - int result = _mkdir(pathname_.c_str()); -#else - int result = mkdir(pathname_.c_str(), 0777); -#endif // GTEST_OS_WINDOWS_MOBILE - - if (result == -1) { - return this->DirectoryExists(); // An error is OK if the directory exists. - } - return true; // No error. -} - -// If input name has a trailing separator character, remove it and return the -// name, otherwise return the name string unmodified. -// On Windows platform, uses \ as the separator, other platforms use /. -FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(pathname_.substr(0, pathname_.length() - 1)) - : *this; -} - -// Removes any redundant separators that might be in the pathname. -// For example, "bar///foo" becomes "bar/foo". Does not eliminate other -// redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). -void FilePath::Normalize() { - if (pathname_.c_str() == NULL) { - pathname_ = ""; - return; - } - const char* src = pathname_.c_str(); - char* const dest = new char[pathname_.length() + 1]; - char* dest_ptr = dest; - memset(dest_ptr, 0, pathname_.length() + 1); - - while (*src != '\0') { - *dest_ptr = *src; - if (!IsPathSeparator(*src)) { - src++; - } else { -#if GTEST_HAS_ALT_PATH_SEP_ - if (*dest_ptr == kAlternatePathSeparator) { - *dest_ptr = kPathSeparator; - } -#endif - while (IsPathSeparator(*src)) - src++; - } - dest_ptr++; - } - *dest_ptr = '\0'; - pathname_ = dest; - delete[] dest; -} - -} // namespace internal -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -#include -#include -#include -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include // For TerminateProcess() -#elif GTEST_OS_WINDOWS -# include -# include -#else -# include -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_MAC -# include -# include -# include -#endif // GTEST_OS_MAC - -#if GTEST_OS_QNX -# include -# include -#endif // GTEST_OS_QNX - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { -namespace internal { - -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_MAC - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const task_t task = mach_task_self(); - mach_msg_type_number_t thread_count; - thread_act_array_t thread_list; - const kern_return_t status = task_threads(task, &thread_list, &thread_count); - if (status == KERN_SUCCESS) { - // task_threads allocates resources in thread_list and we need to free them - // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), - sizeof(thread_t) * thread_count); - return static_cast(thread_count); - } else { - return 0; - } -} - -#elif GTEST_OS_QNX - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const int fd = open("/proc/self/as", O_RDONLY); - if (fd < 0) { - return 0; - } - procfs_info process_info; - const int status = - devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL); - close(fd); - if (status == EOK) { - return static_cast(process_info.num_threads); - } else { - return 0; - } -} - -#else - -size_t GetThreadCount() { - // There's no portable way to detect the number of threads, so we just - // return 0 to indicate that we cannot detect it. - return 0; -} - -#endif // GTEST_OS_MAC - -#if GTEST_USES_POSIX_RE - -// Implements RE. Currently only needed for death tests. - -RE::~RE() { - if (is_valid_) { - // regfree'ing an invalid regex might crash because the content - // of the regex is undefined. Since the regex's are essentially - // the same, one cannot be valid (or invalid) without the other - // being so too. - regfree(&partial_regex_); - regfree(&full_regex_); - } - free(const_cast(pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.full_regex_, str, 1, &match, 0) == 0; -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = posix::StrDup(regex); - - // Reserves enough bytes to hold the regular expression used for a - // full match. - const size_t full_regex_len = strlen(regex) + 10; - char* const full_pattern = new char[full_regex_len]; - - snprintf(full_pattern, full_regex_len, "^(%s)$", regex); - is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; - // We want to call regcomp(&partial_regex_, ...) even if the - // previous expression returns false. Otherwise partial_regex_ may - // not be properly initialized can may cause trouble when it's - // freed. - // - // Some implementation of POSIX regex (e.g. on at least some - // versions of Cygwin) doesn't accept the empty string as a valid - // regex. We change it to an equivalent form "()" to be safe. - if (is_valid_) { - const char* const partial_regex = (*regex == '\0') ? "()" : regex; - is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; - } - EXPECT_TRUE(is_valid_) - << "Regular expression \"" << regex - << "\" is not a valid POSIX Extended regular expression."; - - delete[] full_pattern; -} - -#elif GTEST_USES_SIMPLE_RE - -// Returns true iff ch appears anywhere in str (excluding the -// terminating '\0' character). -bool IsInSet(char ch, const char* str) { - return ch != '\0' && strchr(str, ch) != NULL; -} - -// Returns true iff ch belongs to the given classification. Unlike -// similar functions in , these aren't affected by the -// current locale. -bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } -bool IsAsciiPunct(char ch) { - return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); -} -bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } -bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } -bool IsAsciiWordChar(char ch) { - return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; -} - -// Returns true iff "\\c" is a supported escape sequence. -bool IsValidEscape(char c) { - return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); -} - -// Returns true iff the given atom (specified by escaped and pattern) -// matches ch. The result is undefined if the atom is invalid. -bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { - if (escaped) { // "\\p" where p is pattern_char. - switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); - } - return IsAsciiPunct(pattern_char) && pattern_char == ch; - } - - return (pattern_char == '.' && ch != '\n') || pattern_char == ch; -} - -// Helper function used by ValidateRegex() to format error messages. -std::string FormatRegexSyntaxError(const char* regex, int index) { - return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); -} - -// Generates non-fatal failures and returns false if regex is invalid; -// otherwise returns true. -bool ValidateRegex(const char* regex) { - if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the - // assertion failures to match where the regex is used in user - // code. - ADD_FAILURE() << "NULL is not a valid simple regular expression."; - return false; - } - - bool is_valid = true; - - // True iff ?, *, or + can follow the previous atom. - bool prev_repeatable = false; - for (int i = 0; regex[i]; i++) { - if (regex[i] == '\\') { // An escape sequence - i++; - if (regex[i] == '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "'\\' cannot appear at the end."; - return false; - } - - if (!IsValidEscape(regex[i])) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "invalid escape sequence \"\\" << regex[i] << "\"."; - is_valid = false; - } - prev_repeatable = true; - } else { // Not an escape sequence. - const char ch = regex[i]; - - if (ch == '^' && i > 0) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'^' can only appear at the beginning."; - is_valid = false; - } else if (ch == '$' && regex[i + 1] != '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'$' can only appear at the end."; - is_valid = false; - } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; - is_valid = false; - } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; - is_valid = false; - } - - prev_repeatable = !IsInSet(ch, "^$?*+"); - } - } - - return is_valid; -} - -// Matches a repeated regex atom followed by a valid simple regular -// expression. The regex atom is defined as c if escaped is false, -// or \c otherwise. repeat is the repetition meta character (?, *, -// or +). The behavior is undefined if str contains too many -// characters to be indexable by size_t, in which case the test will -// probably time out anyway. We are fine with this limitation as -// std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { - const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; - // We cannot call numeric_limits::max() as it conflicts with the - // max() macro on Windows. - - for (size_t i = 0; i <= max_count; ++i) { - // We know that the atom matches each of the first i characters in str. - if (i >= min_count && MatchRegexAtHead(regex, str + i)) { - // We have enough matches at the head, and the tail matches too. - // Since we only care about *whether* the pattern matches str - // (as opposed to *how* it matches), there is no need to find a - // greedy match. - return true; - } - if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) - return false; - } - return false; -} - -// Returns true iff regex matches a prefix of str. regex must be a -// valid simple regular expression and not start with "^", or the -// result is undefined. -bool MatchRegexAtHead(const char* regex, const char* str) { - if (*regex == '\0') // An empty regex matches a prefix of anything. - return true; - - // "$" only matches the end of a string. Note that regex being - // valid guarantees that there's nothing after "$" in it. - if (*regex == '$') - return *str == '\0'; - - // Is the first thing in regex an escape sequence? - const bool escaped = *regex == '\\'; - if (escaped) - ++regex; - if (IsRepeat(regex[1])) { - // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so - // here's an indirect recursion. It terminates as the regex gets - // shorter in each recursion. - return MatchRepetitionAndRegexAtHead( - escaped, regex[0], regex[1], regex + 2, str); - } else { - // regex isn't empty, isn't "$", and doesn't start with a - // repetition. We match the first atom of regex with the first - // character of str and recurse. - return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && - MatchRegexAtHead(regex + 1, str + 1); - } -} - -// Returns true iff regex matches any substring of str. regex must be -// a valid simple regular expression, or the result is undefined. -// -// The algorithm is recursive, but the recursion depth doesn't exceed -// the regex length, so we won't need to worry about running out of -// stack space normally. In rare cases the time complexity can be -// exponential with respect to the regex length + the string length, -// but usually it's must faster (often close to linear). -bool MatchRegexAnywhere(const char* regex, const char* str) { - if (regex == NULL || str == NULL) - return false; - - if (*regex == '^') - return MatchRegexAtHead(regex + 1, str); - - // A successful match can be anywhere in str. - do { - if (MatchRegexAtHead(regex, str)) - return true; - } while (*str++ != '\0'); - return false; -} - -// Implements the RE class. - -RE::~RE() { - free(const_cast(pattern_)); - free(const_cast(full_pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = full_pattern_ = NULL; - if (regex != NULL) { - pattern_ = posix::StrDup(regex); - } - - is_valid_ = ValidateRegex(regex); - if (!is_valid_) { - // No need to calculate the full pattern when the regex is invalid. - return; - } - - const size_t len = strlen(regex); - // Reserves enough bytes to hold the regular expression used for a - // full match: we need space to prepend a '^', append a '$', and - // terminate the string with '\0'. - char* buffer = static_cast(malloc(len + 3)); - full_pattern_ = buffer; - - if (*regex != '^') - *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. - - // We don't use snprintf or strncpy, as they trigger a warning when - // compiled with VC++ 8.0. - memcpy(buffer, regex, len); - buffer += len; - - if (len == 0 || regex[len - 1] != '$') - *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. - - *buffer = '\0'; -} - -#endif // GTEST_USES_POSIX_RE - -const char kUnknownFile[] = "unknown file"; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) { - return file_name + ":"; - } -#ifdef _MSC_VER - return file_name + "(" + StreamableToString(line) + "):"; -#else - return file_name + ":" + StreamableToString(line) + ":"; -#endif // _MSC_VER -} - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -// Note that FormatCompilerIndependentFileLocation() does NOT append colon -// to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { - const std::string file_name(file == NULL ? kUnknownFile : file); - - if (line < 0) - return file_name; - else - return file_name + ":" + StreamableToString(line); -} - - -GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) - : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; -} - -// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. -GTestLog::~GTestLog() { - GetStream() << ::std::endl; - if (severity_ == GTEST_FATAL) { - fflush(stderr); - posix::Abort(); - } -} -// Disable Microsoft deprecation warnings for POSIX functions called from -// this class (creat, dup, dup2, and close) -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4996) -#endif // _MSC_VER - -#if GTEST_HAS_STREAM_REDIRECTION - -// Object that captures an output stream (stdout/stderr). -class CapturedStream { - public: - // The ctor redirects the stream to a temporary file. - explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT - - ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", - 0, // Generate unique file name. - temp_file_path); - GTEST_CHECK_(success != 0) - << "Unable to create a temporary file in " << temp_dir_path; - const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; - filename_ = temp_file_path; -# else - // There's no guarantee that a test has write access to the current - // directory, so we create the temporary file in the /tmp directory - // instead. We use /tmp on most systems, and /sdcard on Android. - // That's because Android doesn't have /tmp. -# if GTEST_OS_LINUX_ANDROID - // Note: Android applications are expected to call the framework's - // Context.getExternalStorageDirectory() method through JNI to get - // the location of the world-writable SD Card directory. However, - // this requires a Context handle, which cannot be retrieved - // globally from native code. Doing so also precludes running the - // code as part of a regular standalone executable, which doesn't - // run in a Dalvik process (e.g. when running it through 'adb shell'). - // - // The location /sdcard is directly accessible from native code - // and is the only location (unofficially) supported by the Android - // team. It's generally a symlink to the real SD Card mount point - // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or - // other OEM-customized locations. Never rely on these, and always - // use /sdcard. - char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX"; -# else - char name_template[] = "/tmp/captured_stream.XXXXXX"; -# endif // GTEST_OS_LINUX_ANDROID - const int captured_fd = mkstemp(name_template); - filename_ = name_template; -# endif // GTEST_OS_WINDOWS - fflush(NULL); - dup2(captured_fd, fd_); - close(captured_fd); - } - - ~CapturedStream() { - remove(filename_.c_str()); - } - - std::string GetCapturedString() { - if (uncaptured_fd_ != -1) { - // Restores the original stream. - fflush(NULL); - dup2(uncaptured_fd_, fd_); - close(uncaptured_fd_); - uncaptured_fd_ = -1; - } - - FILE* const file = posix::FOpen(filename_.c_str(), "r"); - const std::string content = ReadEntireFile(file); - posix::FClose(file); - return content; - } - - private: - // Reads the entire content of a file as an std::string. - static std::string ReadEntireFile(FILE* file); - - // Returns the size (in bytes) of a file. - static size_t GetFileSize(FILE* file); - - const int fd_; // A stream to capture. - int uncaptured_fd_; - // Name of the temporary file holding the stderr output. - ::std::string filename_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); -}; - -// Returns the size (in bytes) of a file. -size_t CapturedStream::GetFileSize(FILE* file) { - fseek(file, 0, SEEK_END); - return static_cast(ftell(file)); -} - -// Reads the entire content of a file as a string. -std::string CapturedStream::ReadEntireFile(FILE* file) { - const size_t file_size = GetFileSize(file); - char* const buffer = new char[file_size]; - - size_t bytes_last_read = 0; // # of bytes read in the last fread() - size_t bytes_read = 0; // # of bytes read so far - - fseek(file, 0, SEEK_SET); - - // Keeps reading the file until we cannot read further or the - // pre-determined file size is reached. - do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); - bytes_read += bytes_last_read; - } while (bytes_last_read > 0 && bytes_read < file_size); - - const std::string content(buffer, bytes_read); - delete[] buffer; - - return content; -} - -# ifdef _MSC_VER -# pragma warning(pop) -# endif // _MSC_VER - -static CapturedStream* g_captured_stderr = NULL; -static CapturedStream* g_captured_stdout = NULL; - -// Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { - if (*stream != NULL) { - GTEST_LOG_(FATAL) << "Only one " << stream_name - << " capturer can exist at a time."; - } - *stream = new CapturedStream(fd); -} - -// Stops capturing the output stream and returns the captured string. -std::string GetCapturedStream(CapturedStream** captured_stream) { - const std::string content = (*captured_stream)->GetCapturedString(); - - delete *captured_stream; - *captured_stream = NULL; - - return content; -} - -// Starts capturing stdout. -void CaptureStdout() { - CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); -} - -// Starts capturing stderr. -void CaptureStderr() { - CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); -} - -// Stops capturing stdout and returns the captured string. -std::string GetCapturedStdout() { - return GetCapturedStream(&g_captured_stdout); -} - -// Stops capturing stderr and returns the captured string. -std::string GetCapturedStderr() { - return GetCapturedStream(&g_captured_stderr); -} - -#endif // GTEST_HAS_STREAM_REDIRECTION - -#if GTEST_HAS_DEATH_TEST - -// A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector g_argvs; - -static const ::std::vector* g_injected_test_argvs = - NULL; // Owned. - -void SetInjectableArgvs(const ::std::vector* argvs) { - if (g_injected_test_argvs != argvs) - delete g_injected_test_argvs; - g_injected_test_argvs = argvs; -} - -const ::std::vector& GetInjectableArgvs() { - if (g_injected_test_argvs != NULL) { - return *g_injected_test_argvs; - } - return g_argvs; -} -#endif // GTEST_HAS_DEATH_TEST - -#if GTEST_OS_WINDOWS_MOBILE -namespace posix { -void Abort() { - DebugBreak(); - TerminateProcess(GetCurrentProcess(), 1); -} -} // namespace posix -#endif // GTEST_OS_WINDOWS_MOBILE - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "GTEST_FOO" in the open-source version. -static std::string FlagToEnvVar(const char* flag) { - const std::string full_flag = - (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); - - Message env_var; - for (size_t i = 0; i != full_flag.length(); i++) { - env_var << ToUpper(full_flag.c_str()[i]); - } - - return env_var.GetString(); -} - -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value) { - // Parses the environment variable as a decimal integer. - char* end = NULL; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value \"" << str << "\".\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - // Is the parsed value in the range of an Int32? - const Int32 result = static_cast(long_value); - if (long_value == LONG_MAX || long_value == LONG_MIN || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) - result != long_value - // The parsed value overflows as an Int32. - ) { - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value " << str << ", which overflows.\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - *value = result; - return true; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". -bool BoolFromGTestEnv(const char* flag, bool default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - return string_value == NULL ? - default_value : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - if (string_value == NULL) { - // The environment variable is not set. - return default_value; - } - - Int32 result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { - printf("The default value %s is used.\n", - (Message() << default_value).GetString().c_str()); - fflush(stdout); - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value. -const char* StringFromGTestEnv(const char* flag, const char* default_value) { - const std::string env_var = FlagToEnvVar(flag); - const char* const value = posix::GetEnv(env_var.c_str()); - return value == NULL ? default_value : value; -} - -} // namespace internal -} // namespace testing -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// It uses the << operator when possible, and prints the bytes in the -// object otherwise. A user can override its behavior for a class -// type Foo by defining either operator<<(::std::ostream&, const Foo&) -// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that -// defines Foo. - -#include -#include -#include // NOLINT -#include - -namespace testing { - -namespace { - -using ::std::ostream; - -// Prints a segment of bytes in the given object. -void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, - size_t count, ostream* os) { - char text[5] = ""; - for (size_t i = 0; i != count; i++) { - const size_t j = start + i; - if (i != 0) { - // Organizes the bytes into groups of 2 for easy parsing by - // human. - if ((j % 2) == 0) - *os << ' '; - else - *os << '-'; - } - GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); - *os << text; - } -} - -// Prints the bytes in the given value to the given ostream. -void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, - ostream* os) { - // Tells the user how big the object is. - *os << count << "-byte object <"; - - const size_t kThreshold = 132; - const size_t kChunkSize = 64; - // If the object size is bigger than kThreshold, we'll have to omit - // some details by printing only the first and the last kChunkSize - // bytes. - // TODO(wan): let the user control the threshold using a flag. - if (count < kThreshold) { - PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); - } else { - PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); - *os << " ... "; - // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; - PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); - } - *os << ">"; -} - -} // namespace - -namespace internal2 { - -// Delegates to PrintBytesInObjectToImpl() to print the bytes in the -// given object. The delegation simplifies the implementation, which -// uses the << operator and thus is easier done outside of the -// ::testing::internal namespace, which contains a << operator that -// sometimes conflicts with the one in STL. -void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, - ostream* os) { - PrintBytesInObjectToImpl(obj_bytes, count, os); -} - -} // namespace internal2 - -namespace internal { - -// Depending on the value of a char (or wchar_t), we print it in one -// of three formats: -// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or -// - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; - -// Returns true if c is a printable ASCII character. We test the -// value of c directly instead of calling isprint(), which is buggy on -// Windows Mobile. -inline bool IsPrintableAscii(wchar_t c) { - return 0x20 <= c && c <= 0x7E; -} - -// Prints a wide or narrow char c as a character literal without the -// quotes, escaping it when necessary; returns how c was formatted. -// The template argument UnsignedChar is the unsigned version of Char, -// which is the type of c. -template -static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - switch (static_cast(c)) { - case L'\0': - *os << "\\0"; - break; - case L'\'': - *os << "\\'"; - break; - case L'\\': - *os << "\\\\"; - break; - case L'\a': - *os << "\\a"; - break; - case L'\b': - *os << "\\b"; - break; - case L'\f': - *os << "\\f"; - break; - case L'\n': - *os << "\\n"; - break; - case L'\r': - *os << "\\r"; - break; - case L'\t': - *os << "\\t"; - break; - case L'\v': - *os << "\\v"; - break; - default: - if (IsPrintableAscii(c)) { - *os << static_cast(c); - return kAsIs; - } else { - *os << "\\x" + String::FormatHexInt(static_cast(c)); - return kHexEscape; - } - } - return kSpecialEscape; -} - -// Prints a wchar_t c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { - switch (c) { - case L'\'': - *os << "'"; - return kAsIs; - case L'"': - *os << "\\\""; - return kSpecialEscape; - default: - return PrintAsCharLiteralTo(c, os); - } -} - -// Prints a char c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { - return PrintAsStringLiteralTo( - static_cast(static_cast(c)), os); -} - -// Prints a wide or narrow character c and its code. '\0' is printed -// as "'\\0'", other unprintable characters are also properly escaped -// using the standard C++ escape sequence. The template argument -// UnsignedChar is the unsigned version of Char, which is the type of c. -template -void PrintCharAndCodeTo(Char c, ostream* os) { - // First, print c as a literal in the most readable form we can find. - *os << ((sizeof(c) > 1) ? "L'" : "'"); - const CharFormat format = PrintAsCharLiteralTo(c, os); - *os << "'"; - - // To aid user debugging, we also print c's code in decimal, unless - // it's 0 (in which case c was printed as '\\0', making the code - // obvious). - if (c == 0) - return; - *os << " (" << static_cast(c); - - // For more convenience, we print c's code again in hexidecimal, - // unless c was already printed in the form '\x##' or the code is in - // [1, 9]. - if (format == kHexEscape || (1 <= c && c <= 9)) { - // Do nothing. - } else { - *os << ", 0x" << String::FormatHexInt(static_cast(c)); - } - *os << ")"; -} - -void PrintTo(unsigned char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} -void PrintTo(signed char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} - -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its code. L'\0' is printed as "L'\\0'". -void PrintTo(wchar_t wc, ostream* os) { - PrintCharAndCodeTo(wc, os); -} - -// Prints the given array of characters to the ostream. CharType must be either -// char or wchar_t. -// The array starts at begin, the length is len, it may include '\0' characters -// and may not be NUL-terminated. -template -static void PrintCharsAsStringTo( - const CharType* begin, size_t len, ostream* os) { - const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; - *os << kQuoteBegin; - bool is_previous_hex = false; - for (size_t index = 0; index < len; ++index) { - const CharType cur = begin[index]; - if (is_previous_hex && IsXDigit(cur)) { - // Previous character is of '\x..' form and this character can be - // interpreted as another hexadecimal digit in its number. Break string to - // disambiguate. - *os << "\" " << kQuoteBegin; - } - is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; - } - *os << "\""; -} - -// Prints a (const) char/wchar_t array of 'len' elements, starting at address -// 'begin'. CharType must be either char or wchar_t. -template -static void UniversalPrintCharArray( - const CharType* begin, size_t len, ostream* os) { - // The code - // const char kFoo[] = "foo"; - // generates an array of 4, not 3, elements, with the last one being '\0'. - // - // Therefore when printing a char array, we don't print the last element if - // it's '\0', such that the output matches the string literal as it's - // written in the source code. - if (len > 0 && begin[len - 1] == '\0') { - PrintCharsAsStringTo(begin, len - 1, os); - return; - } - - // If, however, the last element in the array is not '\0', e.g. - // const char kFoo[] = { 'f', 'o', 'o' }; - // we must print the entire array. We also print a message to indicate - // that the array is not NUL-terminated. - PrintCharsAsStringTo(begin, len, os); - *os << " (no terminating NUL)"; -} - -// Prints a (const) char array of 'len' elements, starting at address 'begin'. -void UniversalPrintArray(const char* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints a (const) wchar_t array of 'len' elements, starting at address -// 'begin'. -void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { - UniversalPrintCharArray(begin, len, os); -} - -// Prints the given C string to the ostream. -void PrintTo(const char* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, strlen(s), os); - } -} - -// MSVC compiler can be configured to define whar_t as a typedef -// of unsigned short. Defining an overload for const wchar_t* in that case -// would cause pointers to unsigned shorts be printed as wide strings, -// possibly accessing more memory than intended and causing invalid -// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when -// wchar_t is implemented as a native type. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Prints the given wide C string to the ostream. -void PrintTo(const wchar_t* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, wcslen(s), os); - } -} -#endif // wchar_t is native - -// Prints a ::string object. -#if GTEST_HAS_GLOBAL_STRING -void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} - -// Prints a ::wstring object. -#if GTEST_HAS_GLOBAL_WSTRING -void PrintWideStringTo(const ::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -void PrintWideStringTo(const ::std::wstring& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_STD_WSTRING - -} // namespace internal - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -using internal::GetUnitTestImpl; - -// Gets the summary of the failure message by omitting the stack trace -// in it. -std::string TestPartResult::ExtractSummary(const char* message) { - const char* const stack_trace = strstr(message, internal::kStackTraceMarker); - return stack_trace == NULL ? message : - std::string(message, stack_trace); -} - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os - << result.file_name() << ":" << result.line_number() << ": " - << (result.type() == TestPartResult::kSuccess ? "Success" : - result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : - "Non-fatal failure") << ":\n" - << result.message() << std::endl; -} - -// Appends a TestPartResult to the array. -void TestPartResultArray::Append(const TestPartResult& result) { - array_.push_back(result); -} - -// Returns the TestPartResult at the given index (0-based). -const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { - if (index < 0 || index >= size()) { - printf("\nInvalid index (%d) into TestPartResultArray.\n", index); - internal::posix::Abort(); - } - - return array_[index]; -} - -// Returns the number of TestPartResult objects in the array. -int TestPartResultArray::size() const { - return static_cast(array_.size()); -} - -namespace internal { - -HasNewFatalFailureHelper::HasNewFatalFailureHelper() - : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); -} - -HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( - original_reporter_); -} - -void HasNewFatalFailureHelper::ReportTestPartResult( - const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; - original_reporter_->ReportTestPartResult(result); -} - -} // namespace internal - -} // namespace testing -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -namespace testing { -namespace internal { - -#if GTEST_HAS_TYPED_TEST_P - -// Skips to the first non-space char in str. Returns an empty string if str -// contains only whitespace characters. -static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; - return str; -} - -// Verifies that registered_tests match the test names in -// defined_test_names_; returns registered_tests if successful, or -// aborts the program otherwise. -const char* TypedTestCasePState::VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests) { - typedef ::std::set::const_iterator DefinedTestIter; - registered_ = true; - - // Skip initial whitespace in registered_tests since some - // preprocessors prefix stringizied literals with whitespace. - registered_tests = SkipSpaces(registered_tests); - - Message errors; - ::std::set tests; - for (const char* names = registered_tests; names != NULL; - names = SkipComma(names)) { - const std::string name = GetPrefixUntilComma(names); - if (tests.count(name) != 0) { - errors << "Test " << name << " is listed more than once.\n"; - continue; - } - - bool found = false; - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (name == *it) { - found = true; - break; - } - } - - if (found) { - tests.insert(name); - } else { - errors << "No test named " << name - << " can be found in this test case.\n"; - } - } - - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (tests.count(*it) == 0) { - errors << "You forgot to list test " << *it << ".\n"; - } - } - - const std::string& errors_str = errors.GetString(); - if (errors_str != "") { - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors_str.c_str()); - fflush(stderr); - posix::Abort(); - } - - return registered_tests; -} - -#endif // GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing +#include "src/gtest-assertion-result.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-matchers.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" +#include "src/gtest.cc" diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc new file mode 100644 index 0000000000..f1c0b10dc9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc @@ -0,0 +1,77 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file defines the AssertionResult type. + +#include "gtest/gtest-assertion-result.h" + +#include +#include + +#include "gtest/gtest-message.h" + +namespace testing { + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string*>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult& other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc new file mode 100644 index 0000000000..e6abc6278a --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -0,0 +1,1620 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// This file implements death tests. + +#include "gtest/gtest-death-test.h" + +#include +#include + +#include "gtest/internal/custom/gtest.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_DEATH_TEST + +#if GTEST_OS_MAC +#include +#endif // GTEST_OS_MAC + +#include +#include +#include + +#if GTEST_OS_LINUX +#include +#endif // GTEST_OS_LINUX + +#include + +#if GTEST_OS_WINDOWS +#include +#else +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_QNX +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_FUCHSIA +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // GTEST_OS_FUCHSIA + +#endif // GTEST_HAS_DEATH_TEST + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-string.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +// Constants. + +// The default death test style. +// +// This is defined in internal/gtest-port.h as "fast", but can be overridden by +// a definition in internal/custom/gtest-port.h. The recommended value, which is +// used internally at Google, is "threadsafe". +static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; + +} // namespace testing + +GTEST_DEFINE_string_( + death_test_style, + testing::internal::StringFromGTestEnv("death_test_style", + testing::kDefaultDeathTestStyle), + "Indicates how to run a death test in a forked child process: " + "\"threadsafe\" (child process re-executes the test binary " + "from the beginning, running only the specific death test) or " + "\"fast\" (child process runs the death test immediately " + "after forking)."); + +GTEST_DEFINE_bool_( + death_test_use_fork, + testing::internal::BoolFromGTestEnv("death_test_use_fork", false), + "Instructs to use fork()/_exit() instead of clone() in death tests. " + "Ignored and always uses fork() on POSIX systems where clone() is not " + "implemented. Useful when running under valgrind or similar tools if " + "those do not support clone(). Valgrind 3.3.1 will just fail if " + "it sees an unsupported combination of clone() flags. " + "It is not recommended to use this flag w/o valgrind though it will " + "work in 99% of the cases. Once valgrind is fixed, this flag will " + "most likely be removed."); + +GTEST_DEFINE_string_( + internal_run_death_test, "", + "Indicates the file, line number, temporal index of " + "the single death test to run, and a file descriptor to " + "which a success code may be sent, all separated by " + "the '|' characters. This flag is specified if and only if the " + "current process is a sub-process launched for running a thread-safe " + "death test. FOR INTERNAL USE ONLY."); + +namespace testing { + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Valid only for fast death tests. Indicates the code is running in the +// child process of a fast style death test. +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +static bool g_in_fast_death_test_child = false; +#endif + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +bool InDeathTestChild() { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + // On Windows and Fuchsia, death tests are thread-safe regardless of the value + // of the death_test_style flag. + return !GTEST_FLAG_GET(internal_run_death_test).empty(); + +#else + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") + return !GTEST_FLAG_GET(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return exit_status == exit_code_; + +#else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + m << "Exited with exit status " << exit_code; + +#else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) { + msg << "couldn't detect the number of threads."; + } else { + msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; + return msg.GetString(); +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. +static const int kFuchsiaReadPipeFd = 3; + +#endif + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +static void DeathTestAbort(const std::string& message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag* const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != nullptr) { + FILE* parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. +static void FailFromInternalError(int fd) { + Message error; + char buffer[256]; + int num_read; + + do { + while ((num_read = posix::Read(fd, buffer, 255)) > 0) { + buffer[num_read] = '\0'; + error << buffer; + } + } while (num_read == -1 && errno == EINTR); + + if (num_read == 0) { + GTEST_LOG_(FATAL) << error.GetString(); + } else { + const int last_error = errno; + GTEST_LOG_(FATAL) << "Error while reading death test internal: " + << GetLastErrnoDescription() << " [" << last_error << "]"; + } +} + +// Death test constructor. Increments the running death test count +// for the current test. +DeathTest::DeathTest() { + TestInfo* const info = GetUnitTestImpl()->current_test_info(); + if (info == nullptr) { + DeathTestAbort( + "Cannot run a death test outside of a TEST or " + "TEST_F construct"); + } +} + +// Creates and returns a death test by dispatching to the current +// death test factory. +bool DeathTest::Create(const char* statement, + Matcher matcher, const char* file, + int line, DeathTest** test) { + return GetUnitTestImpl()->death_test_factory()->Create( + statement, std::move(matcher), file, line, test); +} + +const char* DeathTest::LastMessage() { + return last_death_test_message_.c_str(); +} + +void DeathTest::set_last_death_test_message(const std::string& message) { + last_death_test_message_ = message; +} + +std::string DeathTest::last_death_test_message_; + +// Provides cross platform implementation for some death functionality. +class DeathTestImpl : public DeathTest { + protected: + DeathTestImpl(const char* a_statement, Matcher matcher) + : statement_(a_statement), + matcher_(std::move(matcher)), + spawned_(false), + status_(-1), + outcome_(IN_PROGRESS), + read_fd_(-1), + write_fd_(-1) {} + + // read_fd_ is expected to be closed and cleared by a derived class. + ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } + + void Abort(AbortReason reason) override; + bool Passed(bool status_ok) override; + + const char* statement() const { return statement_; } + bool spawned() const { return spawned_; } + void set_spawned(bool is_spawned) { spawned_ = is_spawned; } + int status() const { return status_; } + void set_status(int a_status) { status_ = a_status; } + DeathTestOutcome outcome() const { return outcome_; } + void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } + int read_fd() const { return read_fd_; } + void set_read_fd(int fd) { read_fd_ = fd; } + int write_fd() const { return write_fd_; } + void set_write_fd(int fd) { write_fd_ = fd; } + + // Called in the parent process only. Reads the result code of the death + // test child process via a pipe, interprets it to set the outcome_ + // member, and closes read_fd_. Outputs diagnostics and terminates in + // case of unexpected codes. + void ReadAndInterpretStatusByte(); + + // Returns stderr output from the child process. + virtual std::string GetErrorLogs(); + + private: + // The textual content of the code this object is testing. This class + // doesn't own this string and should not attempt to delete it. + const char* const statement_; + // A matcher that's expected to match the stderr output by the child process. + Matcher matcher_; + // True if the death test child process has been successfully spawned. + bool spawned_; + // The exit status of the child process. + int status_; + // How the death test concluded. + DeathTestOutcome outcome_; + // Descriptor to the read end of the pipe to the child process. It is + // always -1 in the child process. The child keeps its write end of the + // pipe in write_fd_. + int read_fd_; + // Descriptor to the child's write end of the pipe to the parent process. + // It is always -1 in the parent process. The parent keeps its end of the + // pipe in read_fd_. + int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. +void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); } + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). +void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived + : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew + : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0;;) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. +// matcher_: A matcher that's expected to match the stderr output by the child +// process. +// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true if and only if all of the above conditions are met. Otherwise, +// the first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) return false; + + const std::string error_message = GetErrorLogs(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" + << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + if (matcher_.Matches(error_message)) { + success = true; + } else { + std::ostringstream stream; + matcher_.DescribeTo(&stream); + buffer << " Result: died but not with expected error.\n" + << " Expected: " << stream.str() << "\n" + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" + << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; + } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +#if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. +// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()}; + switch (::WaitForMultipleObjects(2, wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. + GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 == + ::WaitForSingleObject(child_handle_.Get(), INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES), + nullptr, TRUE}; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle, + &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd( + ::_open_osfhandle(reinterpret_cast(read_handle), O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + nullptr)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr); + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) + + "|" + StreamableToString(death_test_index) + "|" + + StreamableToString(static_cast(::GetCurrentProcessId())) + + // size_t has the same width as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. + "|" + StreamableToString(reinterpret_cast(write_handle)) + "|" + + StreamableToString(reinterpret_cast(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr, + executable_path, + _MAX_PATH)); + + std::string command_line = std::string(::GetCommandLineA()) + " " + + filter_flag + " \"" + internal_flag + "\""; + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_( + ::CreateProcessA( + executable_path, const_cast(command_line.c_str()), + nullptr, // Returned process handle is not inheritable. + nullptr, // Returned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + nullptr, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} + +#elif GTEST_OS_FUCHSIA + +class FuchsiaDeathTest : public DeathTestImpl { + public: + FuchsiaDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : DeathTestImpl(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + int Wait() override; + TestRole AssumeRole() override; + std::string GetErrorLogs() override; + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; + // The stderr data captured by the child process. + std::string captured_stderr_; + + zx::process child_process_; + zx::channel exception_channel_; + zx::socket stderr_socket_; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { args_.push_back(nullptr); } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { return &args_[0]; } + + int size() { return static_cast(args_.size()) - 1; } + + private: + std::vector args_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int FuchsiaDeathTest::Wait() { + const int kProcessKey = 0; + const int kSocketKey = 1; + const int kExceptionKey = 2; + + if (!spawned()) return 0; + + // Create a port to wait for socket/task/exception events. + zx_status_t status_zx; + zx::port port; + status_zx = zx::port::create(0, &port); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for the child process to terminate. + status_zx = + child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for the socket to be readable or closed. + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Register to wait for an exception. + status_zx = exception_channel_.wait_async(port, kExceptionKey, + ZX_CHANNEL_READABLE, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + bool process_terminated = false; + bool socket_closed = false; + do { + zx_port_packet_t packet = {}; + status_zx = port.wait(zx::time::infinite(), &packet); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + if (packet.key == kExceptionKey) { + // Process encountered an exception. Kill it directly rather than + // letting other handlers process the event. We will get a kProcessKey + // event when the process actually terminates. + status_zx = child_process_.kill(); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } else if (packet.key == kProcessKey) { + // Process terminated. + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + process_terminated = true; + } else if (packet.key == kSocketKey) { + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + if (packet.signal.observed & ZX_SOCKET_READABLE) { + // Read data from the socket. + constexpr size_t kBufferSize = 1024; + do { + size_t old_length = captured_stderr_.length(); + size_t bytes_read = 0; + captured_stderr_.resize(old_length + kBufferSize); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); + captured_stderr_.resize(old_length + bytes_read); + } while (status_zx == ZX_OK); + if (status_zx == ZX_ERR_PEER_CLOSED) { + socket_closed = true; + } else { + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } + } else { + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED); + socket_closed = true; + } + } + } while (!process_terminated && !socket_closed); + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer), + nullptr, nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED); + set_status(static_cast(buffer.return_code)); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line. + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + int child_pipe_fd; + status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + set_read_fd(child_pipe_fd); + + // Set the pipe handle for the child. + fdio_spawn_action_t spawn_actions[2] = {}; + fdio_spawn_action_t* add_handle_action = &spawn_actions[0]; + add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE; + add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd); + add_handle_action->h.handle = child_pipe_handle; + + // Create a socket pair will be used to receive the child process' stderr. + zx::socket stderr_producer_socket; + status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_); + GTEST_DEATH_TEST_CHECK_(status >= 0); + int stderr_producer_fd = -1; + status = + fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd); + GTEST_DEATH_TEST_CHECK_(status >= 0); + + // Make the stderr socket nonblocking. + GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0); + + fdio_spawn_action_t* add_stderr_action = &spawn_actions[1]; + add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD; + add_stderr_action->fd.local_fd = stderr_producer_fd; + add_stderr_action->fd.target_fd = STDERR_FILENO; + + // Create a child job. + zx_handle_t child_job = ZX_HANDLE_INVALID; + status = zx_job_create(zx_job_default(), 0, &child_job); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + zx_policy_basic_t policy; + policy.condition = ZX_POL_NEW_ANY; + policy.policy = ZX_POL_ACTION_ALLOW; + status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, + &policy, 1); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Create an exception channel attached to the |child_job|, to allow + // us to suppress the system default exception handler from firing. + status = zx_task_create_exception_channel( + child_job, 0, exception_channel_.reset_and_get_address()); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Spawn the child process. + status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], + args.Argv(), nullptr, 2, spawn_actions, + child_process_.reset_and_get_address(), nullptr); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + set_spawned(true); + return OVERSEE_TEST; +} + +std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; } + +#else // We are neither on Windows, nor on Fuchsia. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, Matcher matcher); + + // All of these virtual functions are inherited from DeathTest. + int Wait() override; + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, + Matcher matcher) + : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int ForkingDeathTest::Wait() { + if (!spawned()) return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, Matcher matcher) + : ForkingDeathTest(a_statement, std::move(matcher)) {} + TestRole AssumeRole() override; +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. +DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. + // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + g_in_fast_death_test_child = true; + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. +class ExecDeathTest : public ForkingDeathTest { + public: + ExecDeathTest(const char* a_statement, Matcher matcher, + const char* file, int line) + : ForkingDeathTest(a_statement, std::move(matcher)), + file_(file), + line_(line) {} + TestRole AssumeRole() override; + + private: + static ::std::vector GetArgvsForDeathTestChildProcess() { + ::std::vector args = GetInjectableArgvs(); +#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + ::std::vector extra_args = + GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); + args.insert(args.end(), extra_args.begin(), extra_args.end()); +#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) + return args; + } + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { args_.push_back(nullptr); } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { return &args_[0]; } + + private: + std::vector args_; +}; + +// A struct that encompasses the arguments to the child process of a +// threadsafe-style death test process. +struct ExecDeathTestArgs { + char* const* argv; // Command-line arguments for the child's call to exec + int close_fd; // File descriptor to close; the read end of a pipe +}; + +#if GTEST_OS_QNX +extern "C" char** environ; +#else // GTEST_OS_QNX +// The main function for a threadsafe-style death test child process. +// This function is called in a clone()-ed process and thus must avoid +// any potentially unsafe operations like malloc or libc functions. +static int ExecDeathTestChildMain(void* child_arg) { + ExecDeathTestArgs* const args = static_cast(child_arg); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); + + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + // We can safely call execv() as it's almost a direct system call. We + // cannot use execvp() as it's a libc function and thus potentially + // unsafe. Since execv() doesn't search the PATH, the user must + // invoke the test program via a valid path that contains at least + // one path separator. + execv(args->argv[0], args->argv); + DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " + + original_dir + " failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; +} +#endif // GTEST_OS_QNX + +#if GTEST_HAS_CLONE +// Two utility routines that together determine the direction the stack +// grows. +// This could be accomplished more elegantly by a single recursive +// function, but we want to guard against the unlikely possibility of +// a smart compiler optimizing the recursion away. +// +// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining +// StackLowerThanAddress into StackGrowsDown, which then doesn't give +// correct answer. +static void StackLowerThanAddress(const void* ptr, + bool* result) GTEST_NO_INLINE_; +// Make sure sanitizers do not tamper with the stack here. +// Ideally, we want to use `__builtin_frame_address` instead of a local variable +// address with sanitizer disabled, but it does not work when the +// compiler optimizes the stack frame out, which happens on PowerPC targets. +// HWAddressSanitizer add a random tag to the MSB of the local variable address, +// making comparison result unpredictable. +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +static void StackLowerThanAddress(const void* ptr, bool* result) { + int dummy = 0; + *result = std::less()(&dummy, ptr); +} + +// Make sure AddressSanitizer does not tamper with the stack here. +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +static bool StackGrowsDown() { + int dummy = 0; + bool result; + StackLowerThanAddress(&dummy, &result); + return result; +} +#endif // GTEST_HAS_CLONE + +// Spawns a child process with the same executable as the current process in +// a thread-safe manner and instructs it to run the death test. The +// implementation uses fork(2) + exec. On systems where clone(2) is +// available, it is used instead, being slightly more thread-safe. On QNX, +// fork supports only single-threaded environments, so this function uses +// spawn(2) there instead. The function dies with an error message if +// anything goes wrong. +static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) { + ExecDeathTestArgs args = {argv, close_fd}; + pid_t child_pid = -1; + +#if GTEST_OS_QNX + // Obtains the current directory and sets it to be closed in the child + // process. + const int cwd_fd = open(".", O_RDONLY); + GTEST_DEATH_TEST_CHECK_(cwd_fd != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC)); + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. + const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(std::string("chdir(\"") + original_dir + + "\") failed: " + GetLastErrnoDescription()); + return EXIT_FAILURE; + } + + int fd_flags; + // Set close_fd to be closed after spawn. + GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD)); + GTEST_DEATH_TEST_CHECK_SYSCALL_( + fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC)); + struct inheritance inherit = {0}; + // spawn is a system call. + child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ); + // Restores the current working directory. + GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd)); + +#else // GTEST_OS_QNX +#if GTEST_OS_LINUX + // When a SIGPROF signal is received while fork() or clone() are executing, + // the process may hang. To avoid this, we ignore SIGPROF here and re-enable + // it after the call to fork()/clone() is complete. + struct sigaction saved_sigprof_action; + struct sigaction ignore_sigprof_action; + memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action)); + sigemptyset(&ignore_sigprof_action.sa_mask); + ignore_sigprof_action.sa_handler = SIG_IGN; + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action)); +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG_GET(death_test_use_fork); + + if (!use_fork) { + static const bool stack_grows_down = StackGrowsDown(); + const auto stack_size = static_cast(getpagesize() * 2); + // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. + void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); + + // Maximum stack alignment in bytes: For a downward-growing stack, this + // amount is subtracted from size of the stack space to get an address + // that is within the stack space and is aligned on all systems we care + // about. As far as I know there is no ABI with stack alignment greater + // than 64. We assume stack and stack_size already have alignment of + // kMaxStackAlignment. + const size_t kMaxStackAlignment = 64; + void* const stack_top = + static_cast(stack) + + (stack_grows_down ? stack_size - kMaxStackAlignment : 0); + GTEST_DEATH_TEST_CHECK_( + static_cast(stack_size) > kMaxStackAlignment && + reinterpret_cast(stack_top) % kMaxStackAlignment == 0); + + child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); + + GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); + } +#else + const bool use_fork = true; +#endif // GTEST_HAS_CLONE + + if (use_fork && (child_pid = fork()) == 0) { + ExecDeathTestChildMain(&args); + _exit(0); + } +#endif // GTEST_OS_QNX +#if GTEST_OS_LINUX + GTEST_DEATH_TEST_CHECK_SYSCALL_( + sigaction(SIGPROF, &saved_sigprof_action, nullptr)); +#endif // GTEST_OS_LINUX + + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + return child_pid; +} + +// The AssumeRole process for a fork-and-exec death test. It re-executes the +// main program from the beginning, setting the --gtest_filter +// and --gtest_internal_run_death_test flags to cause only the current +// death test to be re-run. +DeathTest::TestRole ExecDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + // Clear the close-on-exec flag on the write end of the pipe, lest + // it be closed when the child process does an exec: + GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); + + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "filter=" + info->test_suite_name() + "." + + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + "internal_run_death_test=" + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index) + "|" + + StreamableToString(pipe_fd[1]); + Arguments args; + args.AddArguments(GetArgvsForDeathTestChildProcess()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // See the comment in NoExecDeathTest::AssumeRole for why the next line + // is necessary. + FlushInfoLog(); + + const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_child_pid(child_pid); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; +} + +#endif // !GTEST_OS_WINDOWS + +// Creates a concrete DeathTest-derived class that depends on the +// --gtest_death_test_style flag, and sets the pointer pointed to +// by the "test" argument to its address. If the test should be +// skipped, sets that pointer to NULL. Returns true, unless the +// flag is set to an invalid value. +bool DefaultDeathTestFactory::Create(const char* statement, + Matcher matcher, + const char* file, int line, + DeathTest** test) { + UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const int death_test_index = + impl->current_test_info()->increment_death_test_count(); + + if (flag != nullptr) { + if (death_test_index > flag->index()) { + DeathTest::set_last_death_test_message( + "Death test count (" + StreamableToString(death_test_index) + + ") somehow exceeded expected maximum (" + + StreamableToString(flag->index()) + ")"); + return false; + } + + if (!(flag->file() == file && flag->line() == line && + flag->index() == death_test_index)) { + *test = nullptr; + return true; + } + } + +#if GTEST_OS_WINDOWS + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new WindowsDeathTest(statement, std::move(matcher), file, line); + } + +#elif GTEST_OS_FUCHSIA + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe" || + GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line); + } + +#else + + if (GTEST_FLAG_GET(death_test_style) == "threadsafe") { + *test = new ExecDeathTest(statement, std::move(matcher), file, line); + } else if (GTEST_FLAG_GET(death_test_style) == "fast") { + *test = new NoExecDeathTest(statement, std::move(matcher)); + } + +#endif // GTEST_OS_WINDOWS + + else { // NOLINT - this is more readable than unbalanced brackets inside #if. + DeathTest::set_last_death_test_message("Unknown death test style \"" + + GTEST_FLAG_GET(death_test_style) + + "\" encountered"); + return false; + } + + return true; +} + +#if GTEST_OS_WINDOWS +// Recreates the pipe and event handles from the provided parameters, +// signals the event, and returns a file descriptor wrapped around the pipe +// handle. This function is called in the child process only. +static int GetStatusFileDescriptor(unsigned int parent_process_id, + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { + AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, + FALSE, // Non-inheritable. + parent_process_id)); + if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { + DeathTestAbort("Unable to open parent process " + + StreamableToString(parent_process_id)); + } + + GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); + + const HANDLE write_handle = reinterpret_cast(write_handle_as_size_t); + HANDLE dup_write_handle; + + // The newly initialized handle is accessible only in the parent + // process. To obtain one accessible within the child, we need to use + // DuplicateHandle. + if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, + ::GetCurrentProcess(), &dup_write_handle, + 0x0, // Requested privileges ignored since + // DUPLICATE_SAME_ACCESS is used. + FALSE, // Request non-inheritable handler. + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the pipe handle " + + StreamableToString(write_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); + HANDLE dup_event_handle; + + if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, + ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE, + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort("Unable to duplicate the event handle " + + StreamableToString(event_handle_as_size_t) + + " from the parent process " + + StreamableToString(parent_process_id)); + } + + const int write_fd = + ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); + if (write_fd == -1) { + DeathTestAbort("Unable to convert pipe handle " + + StreamableToString(write_handle_as_size_t) + + " to a file descriptor"); + } + + // Signals the parent that the write end of the pipe has been acquired + // so the parent can release its own write end. + ::SetEvent(dup_event_handle); + + return write_fd; +} +#endif // GTEST_OS_WINDOWS + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { + if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr; + + // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we + // can use it here. + int line = -1; + int index = -1; + ::std::vector< ::std::string> fields; + SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields); + int write_fd = -1; + +#if GTEST_OS_WINDOWS + + unsigned int parent_process_id = 0; + size_t write_handle_as_size_t = 0; + size_t event_handle_as_size_t = 0; + + if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &parent_process_id) || + !ParseNaturalNumber(fields[4], &write_handle_as_size_t) || + !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, + event_handle_as_size_t); + +#elif GTEST_OS_FUCHSIA + + if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + +#else + + if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) || + !ParseNaturalNumber(fields[2], &index) || + !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG_GET(internal_run_death_test)); + } + +#endif // GTEST_OS_WINDOWS + + return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); +} + +} // namespace internal + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc new file mode 100644 index 0000000000..f6ee90cdb7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -0,0 +1,367 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/internal/gtest-filepath.h" + +#include + +#include "gtest/gtest-message.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_OS_WINDOWS_MOBILE +#include +#elif GTEST_OS_WINDOWS +#include +#include +#else +#include + +#include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE + +#include "gtest/internal/gtest-string.h" + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_MAX_ _MAX_PATH +#elif defined(PATH_MAX) +#define GTEST_PATH_MAX_ PATH_MAX +#elif defined(_XOPEN_PATH_MAX) +#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#else +#define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#endif // GTEST_OS_WINDOWS + +namespace testing { +namespace internal { + +#if GTEST_OS_WINDOWS +// On Windows, '\\' is the standard path separator, but many tools and the +// Windows API also accept '/' as an alternate path separator. Unless otherwise +// noted, a file path can contain either kind of path separators, or a mixture +// of them. +const char kPathSeparator = '\\'; +const char kAlternatePathSeparator = '/'; +const char kAlternatePathSeparatorString[] = "/"; +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE doesn't have a current directory. You should not use +// the current directory in tests on Windows CE, but this at least +// provides a reasonable fallback. +const char kCurrentDirectoryString[] = "\\"; +// Windows CE doesn't define INVALID_FILE_ATTRIBUTES +const DWORD kInvalidFileAttributes = 0xffffffff; +#else +const char kCurrentDirectoryString[] = ".\\"; +#endif // GTEST_OS_WINDOWS_MOBILE +#else +const char kPathSeparator = '/'; +const char kCurrentDirectoryString[] = "./"; +#endif // GTEST_OS_WINDOWS + +// Returns whether the given character is a valid path separator. +static bool IsPathSeparator(char c) { +#if GTEST_HAS_ALT_PATH_SEP_ + return (c == kPathSeparator) || (c == kAlternatePathSeparator); +#else + return c == kPathSeparator; +#endif +} + +// Returns the current working directory, or "" if unsuccessful. +FilePath FilePath::GetCurrentDir() { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \ + GTEST_OS_XTENSA + // These platforms do not have a current directory, so we just return + // something reasonable. + return FilePath(kCurrentDirectoryString); +#elif GTEST_OS_WINDOWS + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; + return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd); +#else + char cwd[GTEST_PATH_MAX_ + 1] = {'\0'}; + char* result = getcwd(cwd, sizeof(cwd)); +#if GTEST_OS_NACL + // getcwd will likely fail in NaCl due to the sandbox, so return something + // reasonable. The user may have provided a shim implementation for getcwd, + // however, so fallback only when failure is detected. + return FilePath(result == nullptr ? kCurrentDirectoryString : cwd); +#endif // GTEST_OS_NACL + return FilePath(result == nullptr ? "" : cwd); +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns a copy of the FilePath with the case-insensitive extension removed. +// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns +// FilePath("dir/file"). If a case-insensitive extension is not +// found, returns a copy of the original FilePath. +FilePath FilePath::RemoveExtension(const char* extension) const { + const std::string dot_extension = std::string(".") + extension; + if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) { + return FilePath( + pathname_.substr(0, pathname_.length() - dot_extension.length())); + } + return *this; +} + +// Returns a pointer to the last occurrence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != nullptr && + (last_sep == nullptr || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(last_sep + 1) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveFileName() const { + const char* const last_sep = FindLastPathSeparator(); + std::string dir; + if (last_sep) { + dir = std::string(c_str(), static_cast(last_sep + 1 - c_str())); + } else { + dir = kCurrentDirectoryString; + } + return FilePath(dir); +} + +// Helper functions for naming files in a directory for xml output. + +// Given directory = "dir", base_name = "test", number = 0, +// extension = "xml", returns "dir/test.xml". If number is greater +// than zero (e.g., 12), returns "dir/test_12.xml". +// On Windows platform, uses \ as the separator rather than /. +FilePath FilePath::MakeFileName(const FilePath& directory, + const FilePath& base_name, int number, + const char* extension) { + std::string file; + if (number == 0) { + file = base_name.string() + "." + extension; + } else { + file = + base_name.string() + "_" + StreamableToString(number) + "." + extension; + } + return ConcatPaths(directory, FilePath(file)); +} + +// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". +// On Windows, uses \ as the separator rather than /. +FilePath FilePath::ConcatPaths(const FilePath& directory, + const FilePath& relative_path) { + if (directory.IsEmpty()) return relative_path; + const FilePath dir(directory.RemoveTrailingPathSeparator()); + return FilePath(dir.string() + kPathSeparator + relative_path.string()); +} + +// Returns true if pathname describes something findable in the file-system, +// either a file, directory, or whatever. +bool FilePath::FileOrDirectoryExists() const { +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete[] unicode; + return attributes != kInvalidFileAttributes; +#else + posix::StatStruct file_stat{}; + return posix::Stat(pathname_.c_str(), &file_stat) == 0; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns true if pathname describes a directory in the file-system +// that exists. +bool FilePath::DirectoryExists() const { + bool result = false; +#if GTEST_OS_WINDOWS + // Don't strip off trailing separator if path is a root directory on + // Windows (like "C:\\"). + const FilePath& path(IsRootDirectory() ? *this + : RemoveTrailingPathSeparator()); +#else + const FilePath& path(*this); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete[] unicode; + if ((attributes != kInvalidFileAttributes) && + (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = true; + } +#else + posix::StatStruct file_stat{}; + result = + posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat); +#endif // GTEST_OS_WINDOWS_MOBILE + + return result; +} + +// Returns true if pathname describes a root directory. (Windows has one +// root directory per disk drive.) +bool FilePath::IsRootDirectory() const { +#if GTEST_OS_WINDOWS + return pathname_.length() == 3 && IsAbsolutePath(); +#else + return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); +#endif +} + +// Returns true if pathname describes an absolute path. +bool FilePath::IsAbsolutePath() const { + const char* const name = pathname_.c_str(); +#if GTEST_OS_WINDOWS + return pathname_.length() >= 3 && + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && IsPathSeparator(name[2]); +#else + return IsPathSeparator(name[0]); +#endif +} + +// Returns a pathname for a file that does not currently exist. The pathname +// will be directory/base_name.extension or +// directory/base_name_.extension if directory/base_name.extension +// already exists. The number will be incremented until a pathname is found +// that does not already exist. +// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. +// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. +bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, nullptr) ? 0 : -1; + delete[] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA + // do nothing + int result = 0; +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +void FilePath::Normalize() { + auto out = pathname_.begin(); + + for (const char character : pathname_) { + if (!IsPathSeparator(character)) { + *(out++) = character; + } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) { + *(out++) = kPathSeparator; + } else { + continue; + } + } + + pathname_.erase(out, pathname_.end()); +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h new file mode 100644 index 0000000000..0b9e929c68 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -0,0 +1,1212 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework.// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ +#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ + +#ifndef _WIN32_WCE +#include +#endif // !_WIN32_WCE +#include +#include // For strtoll/_strtoul64/malloc/free. +#include // For memmove. + +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#endif + +#if GTEST_OS_WINDOWS +#include // NOLINT +#endif // GTEST_OS_WINDOWS + +#include "gtest/gtest-spi.h" +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. +GTEST_DECLARE_bool_(death_test_use_fork); + +namespace testing { +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true if and only if Google Test should use colors in the output. +GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Converts the given time in milliseconds to a date string in the ISO 8601 +// format, without the timezone information. N.B.: due to the use the +// non-reentrant localtime() function, this function is not thread safe. Do +// not use it in any code that can be called from multiple threads. +GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(int32_t random_seed_flag) { + const unsigned int raw_seed = + (random_seed_flag == 0) ? static_cast(GetTimeInMillis()) + : static_cast(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast((raw_seed - 1U) % + static_cast(kMaxRandomSeed)) + + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. +inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. + GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG_GET(break_on_failure); + catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions); + color_ = GTEST_FLAG_GET(color); + death_test_style_ = GTEST_FLAG_GET(death_test_style); + death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork); + fail_fast_ = GTEST_FLAG_GET(fail_fast); + filter_ = GTEST_FLAG_GET(filter); + internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test); + list_tests_ = GTEST_FLAG_GET(list_tests); + output_ = GTEST_FLAG_GET(output); + brief_ = GTEST_FLAG_GET(brief); + print_time_ = GTEST_FLAG_GET(print_time); + print_utf8_ = GTEST_FLAG_GET(print_utf8); + random_seed_ = GTEST_FLAG_GET(random_seed); + repeat_ = GTEST_FLAG_GET(repeat); + recreate_environments_when_repeating_ = + GTEST_FLAG_GET(recreate_environments_when_repeating); + shuffle_ = GTEST_FLAG_GET(shuffle); + stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth); + stream_result_to_ = GTEST_FLAG_GET(stream_result_to); + throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_); + GTEST_FLAG_SET(break_on_failure, break_on_failure_); + GTEST_FLAG_SET(catch_exceptions, catch_exceptions_); + GTEST_FLAG_SET(color, color_); + GTEST_FLAG_SET(death_test_style, death_test_style_); + GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_); + GTEST_FLAG_SET(filter, filter_); + GTEST_FLAG_SET(fail_fast, fail_fast_); + GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_); + GTEST_FLAG_SET(list_tests, list_tests_); + GTEST_FLAG_SET(output, output_); + GTEST_FLAG_SET(brief, brief_); + GTEST_FLAG_SET(print_time, print_time_); + GTEST_FLAG_SET(print_utf8, print_utf8_); + GTEST_FLAG_SET(random_seed, random_seed_); + GTEST_FLAG_SET(repeat, repeat_); + GTEST_FLAG_SET(recreate_environments_when_repeating, + recreate_environments_when_repeating_); + GTEST_FLAG_SET(shuffle, shuffle_); + GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_); + GTEST_FLAG_SET(stream_result_to, stream_result_to_); + GTEST_FLAG_SET(throw_on_failure, throw_on_failure_); + } + + private: + // Fields for saving the original values of flags. + bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + std::string color_; + std::string death_test_style_; + bool death_test_use_fork_; + bool fail_fast_; + std::string filter_; + std::string internal_run_death_test_; + bool list_tests_; + std::string output_; + bool brief_; + bool print_time_; + bool print_utf8_; + int32_t random_seed_; + int32_t repeat_; + bool recreate_environments_when_repeating_; + bool shuffle_; + int32_t stack_trace_depth_; + std::string stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +GTEST_API_ std::string CodePointToUtf8(uint32_t code_point); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars); + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded(); + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (e.g., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. +GTEST_API_ bool ShouldShard(const char* total_shards_str, + const char* shard_index_str, + bool in_subprocess_for_death_test); + +// Parses the environment variable var as a 32-bit integer. If it is unset, +// returns default_val. If it is not a 32-bit integer, prints an error and +// and aborts. +GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val); + +// Given the total number of shards, the shard index, and the test id, +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index, + int test_id); + +// STL container utilities. + +// Returns the number of elements in the given container that satisfy +// the given predicate. +template +inline int CountIf(const Container& c, Predicate predicate) { + // Implemented as an explicit loop since std::count_if() in libCstd on + // Solaris has a non-standard signature. + int count = 0; + for (auto it = c.begin(); it != c.end(); ++it) { + if (predicate(*it)) ++count; + } + return count; +} + +// Applies a function/functor to each element in the container. +template +void ForEach(const Container& c, Functor functor) { + std::for_each(c.begin(), c.end(), functor); +} + +// Returns the i-th element of the vector, or default_value if i is not +// in range [0, v.size()). +template +inline E GetElementOr(const std::vector& v, int i, E default_value) { + return (i < 0 || i >= static_cast(v.size())) ? default_value + : v[static_cast(i)]; +} + +// Performs an in-place shuffle of a range of the vector's elements. +// 'begin' and 'end' are element indices as an STL-style range; +// i.e. [begin, end) are shuffled, where 'end' == size() means to +// shuffle to the end of the vector. +template +void ShuffleRange(internal::Random* random, int begin, int end, + std::vector* v) { + const int size = static_cast(v->size()); + GTEST_CHECK_(0 <= begin && begin <= size) + << "Invalid shuffle range start " << begin << ": must be in range [0, " + << size << "]."; + GTEST_CHECK_(begin <= end && end <= size) + << "Invalid shuffle range finish " << end << ": must be in range [" + << begin << ", " << size << "]."; + + // Fisher-Yates shuffle, from + // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle + for (int range_width = end - begin; range_width >= 2; range_width--) { + const int last_in_range = begin + range_width - 1; + const int selected = + begin + + static_cast(random->Generate(static_cast(range_width))); + std::swap((*v)[static_cast(selected)], + (*v)[static_cast(last_in_range)]); + } +} + +// Performs an in-place shuffle of the vector's elements. +template +inline void Shuffle(internal::Random* random, std::vector* v) { + ShuffleRange(random, 0, static_cast(v->size()), v); +} + +// A function for deleting an object. Handy for being used as a +// functor. +template +static void Delete(T* x) { + delete x; +} + +// A predicate that checks the key of a TestProperty against a known key. +// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. + // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const std::string& key) : key_(key) {} + + // Returns true if and only if the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return test_property.key() == key_; + } + + private: + std::string key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static std::string GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static std::string GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true if and only if the user-specified filter matches the test + // suite name and the test name. + static bool FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. + + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". + static bool MatchesFilter(const std::string& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as an std::string. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. + static const char* const kElidedFramesMarker; + + private: + OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete; + OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) = + delete; +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() {} + + std::string CurrentStackTrace(int max_depth, int skip_count) override; + void UponLeavingGTest() override; + + private: +#if GTEST_HAS_ABSL + Mutex mutex_; // Protects all internal state. + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to the stack trace code from within the user code. + void* caller_frame_ = nullptr; +#endif // GTEST_HAS_ABSL + + OsStackTraceGetter(const OsStackTraceGetter&) = delete; + OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete; +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + std::string message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + UnitTestImpl* const unit_test_; + + DefaultGlobalTestPartResultReporter( + const DefaultGlobalTestPartResultReporter&) = delete; + DefaultGlobalTestPartResultReporter& operator=( + const DefaultGlobalTestPartResultReporter&) = delete; +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + void ReportTestPartResult(const TestPartResult& result) override; + + private: + UnitTestImpl* const unit_test_; + + DefaultPerThreadTestPartResultReporter( + const DefaultPerThreadTestPartResultReporter&) = delete; + DefaultPerThreadTestPartResultReporter& operator=( + const DefaultPerThreadTestPartResultReporter&) = delete; +}; + +// The private implementation of the UnitTest class. We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. + // You can register your own repoter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result repoter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test suites. + int successful_test_suite_count() const; + + // Gets the number of failed test suites. + int failed_test_suite_count() const; + + // Gets the number of all test suites. + int total_test_suite_count() const; + + // Gets the number of all test suites that contain at least one test + // that should run. + int test_suite_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of skipped tests. + int skipped_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests that will be reported in the XML report. + int reportable_disabled_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of tests to be printed in the XML report. + int reportable_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp() const { return start_timestamp_; } + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true if and only if the unit test passed (i.e. all test suites + // passed). + bool Passed() const { return !Failed(); } + + // Returns true if and only if the unit test failed (i.e. some test suite + // failed or something outside of all tests failed). + bool Failed() const { + return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + const TestSuite* GetTestSuite(int i) const { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(i)]; + } + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + const TestCase* GetTestCase(int i) const { return GetTestSuite(i); } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Gets the i-th test suite among all the test suites. i can range from 0 to + // total_test_suite_count() - 1. If i is not in that range, returns NULL. + TestSuite* GetMutableSuiteCase(int i) { + const int index = GetElementOr(test_suite_indices_, i, -1); + return index < 0 ? nullptr : test_suites_[static_cast(index)]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. + const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. + // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. + void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as an std::string. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + std::string CurrentOsStackTraceExceptTop(int skip_count) + GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_; + + // Finds and returns a TestSuite with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_suite_name: name of the test suite + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc); + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + TestCase* GetTestCase(const char* test_case_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc); + } +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Adds a TestInfo to the unit test. + // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test suite + // tear_down_tc: pointer to the function that tears down the test suite + // test_info: the TestInfo object + void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc, + TestInfo* test_info) { +#if GTEST_HAS_DEATH_TEST + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } +#endif // GTEST_HAS_DEATH_TEST + + GetTestSuite(test_info->test_suite_name(), test_info->type_param(), + set_up_tc, tear_down_tc) + ->AddTestInfo(test_info); + } + + // Returns ParameterizedTestSuiteRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } + + std::set* ignored_parameterized_test_suites() { + return &ignored_parameterized_test_suites_; + } + + // Returns TypeParameterizedTestSuiteRegistry object used to keep track of + // type-parameterized tests and instantiations of them. + internal::TypeParameterizedTestSuiteRegistry& + type_parameterized_test_registry() { + return type_parameterized_test_registry_; + } + + // Sets the TestSuite object for the test that's currently running. + void set_current_test_suite(TestSuite* a_current_test_suite) { + current_test_suite_ = a_current_test_suite; + } + + // Sets the TestInfo object for the test that's currently running. If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. + void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. If any exception is + // thrown during a test, this test is considered to be failed, but + // the rest of the tests will still be run. + bool RunAllTests(); + + // Clears the results of all tests, except the ad hoc tests. + void ClearNonAdHocTestResult() { + ForEach(test_suites_, TestSuite::ClearTestSuiteResult); + } + + // Clears the results of ad-hoc test assertions. + void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); } + + // Adds a TestProperty to the current TestResult object when invoked in a + // context of a test or a test suite, or to the global property set. If the + // result already contains a property with the same key, the value will be + // updated. + void RecordProperty(const TestProperty& test_property); + + enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL }; + + // Matches the full name of each test against the user-specified + // filter to decide whether the test should run, then records the + // result in each TestSuite and TestInfo object. + // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests + // based on sharding variables in the environment. + // Returns the number of tests that should run. + int FilterTests(ReactionToSharding shard_tests); + + // Prints the names of the tests matching the user-specified filter flag. + void ListTestsMatchingFilter(); + + const TestSuite* current_test_suite() const { return current_test_suite_; } + TestInfo* current_test_info() { return current_test_info_; } + const TestInfo* current_test_info() const { return current_test_info_; } + + // Returns the vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector& environments() { return environments_; } + + // Getters for the per-thread Google Test trace stack. + std::vector& gtest_trace_stack() { + return *(gtest_trace_stack_.pointer()); + } + const std::vector& gtest_trace_stack() const { + return gtest_trace_stack_.get(); + } + +#if GTEST_HAS_DEATH_TEST + void InitDeathTestSubprocessControlInfo() { + internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); + } + // Returns a pointer to the parsed --gtest_internal_run_death_test + // flag, or NULL if that flag was not specified. + // This information is useful only in a death test child process. + // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. + internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test suites, and the tests within each test suite, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test suites and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. + bool catch_exceptions() const { return catch_exceptions_; } + + private: + friend class ::testing::UnitTest; + + // Used by UnitTest::Run() to capture the state of + // GTEST_FLAG(catch_exceptions) at the moment it starts. + void set_catch_exceptions(bool value) { catch_exceptions_ = value; } + + // The UnitTest object that owns this implementation object. + UnitTest* const parent_; + + // The working directory when the first TEST() or TEST_F() was + // executed. + internal::FilePath original_working_dir_; + + // The default test part result reporters. + DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; + DefaultPerThreadTestPartResultReporter + default_per_thread_test_part_result_reporter_; + + // Points to (but doesn't own) the global test part result reporter. + TestPartResultReporterInterface* global_test_part_result_repoter_; + + // Protects read and write access to global_test_part_result_reporter_. + internal::Mutex global_test_part_result_reporter_mutex_; + + // Points to (but doesn't own) the per-thread test part result reporter. + internal::ThreadLocal + per_thread_test_part_result_reporter_; + + // The vector of environments that need to be set-up/torn-down + // before/after the tests are run. + std::vector environments_; + + // The vector of TestSuites in their original order. It owns the + // elements in the vector. + std::vector test_suites_; + + // Provides a level of indirection for the test suite list to allow + // easy shuffling and restoring the test suite order. The i-th + // element of this vector is the index of the i-th test suite in the + // shuffled order. + std::vector test_suite_indices_; + + // ParameterizedTestRegistry object used to register value-parameterized + // tests. + internal::ParameterizedTestSuiteRegistry parameterized_test_registry_; + internal::TypeParameterizedTestSuiteRegistry + type_parameterized_test_registry_; + + // The set holding the name of parameterized + // test suites that may go uninstantiated. + std::set ignored_parameterized_test_suites_; + + // Indicates whether RegisterParameterizedTests() has been called already. + bool parameterized_tests_registered_; + + // Index of the last death test suite registered. Initially -1. + int last_death_test_suite_; + + // This points to the TestSuite for the currently running test. It + // changes as Google Test goes through one test suite after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestSuite* current_test_suite_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. + // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True if and only if PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // The time of the test program start, in ms from the start of the + // UNIX epoch. + TimeInMillis start_timestamp_; + + // How long the test took to run, in milliseconds. + TimeInMillis elapsed_time_; + +#if GTEST_HAS_DEATH_TEST + // The decomposed components of the gtest_internal_run_death_test flag, + // parsed when RUN_ALL_TESTS is called. + std::unique_ptr internal_run_death_test_flag_; + std::unique_ptr death_test_factory_; +#endif // GTEST_HAS_DEATH_TEST + + // A per-thread stack of traces created by the SCOPED_TRACE() macro. + internal::ThreadLocal > gtest_trace_stack_; + + // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() + // starts. + bool catch_exceptions_; + + UnitTestImpl(const UnitTestImpl&) = delete; + UnitTestImpl& operator=(const UnitTestImpl&) = delete; +}; // class UnitTestImpl + +// Convenience function for accessing the global UnitTest +// implementation object. +inline UnitTestImpl* GetUnitTestImpl() { + return UnitTest::GetInstance()->impl(); +} + +#if GTEST_USES_SIMPLE_RE + +// Internal helper functions for implementing the simple regular +// expression matcher. +GTEST_API_ bool IsInSet(char ch, const char* str); +GTEST_API_ bool IsAsciiDigit(char ch); +GTEST_API_ bool IsAsciiPunct(char ch); +GTEST_API_ bool IsRepeat(char ch); +GTEST_API_ bool IsAsciiWhiteSpace(char ch); +GTEST_API_ bool IsAsciiWordChar(char ch); +GTEST_API_ bool IsValidEscape(char ch); +GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); +GTEST_API_ bool ValidateRegex(const char* regex); +GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); +GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch, + char repeat, const char* regex, + const char* str); +GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); + +#endif // GTEST_USES_SIMPLE_RE + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); +GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); + +#if GTEST_HAS_DEATH_TEST + +// Returns the message describing the last system error, regardless of the +// platform. +GTEST_API_ std::string GetLastErrnoDescription(); + +// Attempts to parse a string into a positive integer pointed to by the +// number parameter. Returns true if that is possible. +// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use +// it here. +template +bool ParseNaturalNumber(const ::std::string& str, Integer* number) { + // Fail fast if the given string does not begin with a digit; + // this bypasses strtoXXX's "optional leading whitespace and plus + // or minus sign" semantics, which are undesirable here. + if (str.empty() || !IsDigit(str[0])) { + return false; + } + errno = 0; + + char* end; + // BiggestConvertible is the largest integer type that system-provided + // string-to-number conversion routines can return. + using BiggestConvertible = unsigned long long; // NOLINT + + const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); // NOLINT + const bool parse_success = *end == '\0' && errno == 0; + + GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); + + const Integer result = static_cast(parsed); + if (parse_success && static_cast(result) == parsed) { + *number = result; + return true; + } + return false; +} +#endif // GTEST_HAS_DEATH_TEST + +// TestResult contains some private methods that should be hidden from +// Google Test user but are required for testing. This class allow our tests +// to access them. +// +// This class is supplied only for the purpose of testing Google Test's own +// constructs. Do not use it in user tests, either directly or indirectly. +class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const std::string& xml_element, + const TestProperty& property) { + test_result->RecordProperty(xml_element, property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class StreamingListener : public EmptyTestEventListener { + public: + // Abstract base class for writing strings to a socket. + class AbstractSocketWriter { + public: + virtual ~AbstractSocketWriter() {} + + // Sends a string to the socket. + virtual void Send(const std::string& message) = 0; + + // Closes the socket. + virtual void CloseConnection() {} + + // Sends a string and a newline to the socket. + void SendLn(const std::string& message) { Send(message + "\n"); } + }; + + // Concrete class for actually writing strings to a socket. + class SocketWriter : public AbstractSocketWriter { + public: + SocketWriter(const std::string& host, const std::string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + } + + ~SocketWriter() override { + if (sockfd_ != -1) CloseConnection(); + } + + // Sends a string to the socket. + void Send(const std::string& message) override { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const auto len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != static_cast(len)) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() override { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + int sockfd_; // socket file descriptor + const std::string host_name_; + const std::string port_num_; + + SocketWriter(const SocketWriter&) = delete; + SocketWriter& operator=(const SocketWriter&) = delete; + }; // class SocketWriter + + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static std::string UrlEncode(const char* str); + + StreamingListener(const std::string& host, const std::string& port) + : socket_writer_(new SocketWriter(host, port)) { + Start(); + } + + explicit StreamingListener(AbstractSocketWriter* socket_writer) + : socket_writer_(socket_writer) { + Start(); + } + + void OnTestProgramStart(const UnitTest& /* unit_test */) override { + SendLn("event=TestProgramStart"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) override { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed())); + + // Notify the streaming server to stop. + socket_writer_->CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, + int iteration) override { + SendLn("event=TestIterationStart&iteration=" + + StreamableToString(iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, + int /* iteration */) override { + SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) + + "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) + + "ms"); + } + + // Note that "event=TestCaseStart" is a wire format and has to remain + // "case" for compatibility + void OnTestSuiteStart(const TestSuite& test_suite) override { + SendLn(std::string("event=TestCaseStart&name=") + test_suite.name()); + } + + // Note that "event=TestCaseEnd" is a wire format and has to remain + // "case" for compatibility + void OnTestSuiteEnd(const TestSuite& test_suite) override { + SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) + + "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) + + "ms"); + } + + void OnTestStart(const TestInfo& test_info) override { + SendLn(std::string("event=TestStart&name=") + test_info.name()); + } + + void OnTestEnd(const TestInfo& test_info) override { + SendLn("event=TestEnd&passed=" + + FormatBool((test_info.result())->Passed()) + "&elapsed_time=" + + StreamableToString((test_info.result())->elapsed_time()) + "ms"); + } + + void OnTestPartResult(const TestPartResult& test_part_result) override { + const char* file_name = test_part_result.file_name(); + if (file_name == nullptr) file_name = ""; + SendLn("event=TestPartResult&file=" + UrlEncode(file_name) + + "&line=" + StreamableToString(test_part_result.line_number()) + + "&message=" + UrlEncode(test_part_result.message())); + } + + private: + // Sends the given message and a newline to the socket. + void SendLn(const std::string& message) { socket_writer_->SendLn(message); } + + // Called at the start of streaming to notify the receiver what + // protocol we are using. + void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } + + std::string FormatBool(bool value) { return value ? "1" : "0"; } + + const std::unique_ptr socket_writer_; + + StreamingListener(const StreamingListener&) = delete; + StreamingListener& operator=(const StreamingListener&) = delete; +}; // class StreamingListener + +#endif // GTEST_CAN_STREAM_RESULTS_ + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc new file mode 100644 index 0000000000..7e3bcc0cff --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-matchers.cc @@ -0,0 +1,98 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This file implements just enough of the matcher interface to allow +// EXPECT_DEATH and friends to accept a matcher argument. + +#include "gtest/gtest-matchers.h" + +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a const std::string& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a std::string whose value is equal to +// s. +Matcher::Matcher(const char* s) { *this = Eq(std::string(s)); } + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(const std::string& s) { + *this = Eq(s); +} + +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a const StringView& whose value is +// equal to s. +Matcher::Matcher(internal::StringView s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(const std::string& s) { *this = Eq(s); } + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(const char* s) { + *this = Eq(std::string(s)); +} + +// Constructs a matcher that matches a StringView whose value is equal to +// s. +Matcher::Matcher(internal::StringView s) { + *this = Eq(std::string(s)); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc new file mode 100644 index 0000000000..d797fe4d58 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -0,0 +1,1394 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/internal/gtest-port.h" + +#include +#include +#include +#include + +#include +#include +#include + +#if GTEST_OS_WINDOWS +#include +#include +#include + +#include // Used in ThreadLocal. +#ifdef _MSC_VER +#include +#endif // _MSC_VER +#else +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#include +#include +#include +#endif // GTEST_OS_MAC + +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD || GTEST_OS_OPENBSD +#include +#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#include +#endif +#endif + +#if GTEST_OS_QNX +#include +#include +#include +#endif // GTEST_OS_QNX + +#if GTEST_OS_AIX +#include +#include +#endif // GTEST_OS_AIX + +#if GTEST_OS_FUCHSIA +#include +#include +#endif // GTEST_OS_FUCHSIA + +#include "gtest/gtest-message.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "src/gtest-internal-inl.h" + +namespace testing { +namespace internal { + +#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD + +namespace { +template +T ReadProcFileField(const std::string& filename, int field) { + std::string dummy; + std::ifstream file(filename.c_str()); + while (field-- > 0) { + file >> dummy; + } + T output = 0; + file >> output; + return output; +} +} // namespace + +// Returns the number of active threads, or 0 when there is an error. +size_t GetThreadCount() { + const std::string filename = + (Message() << "/proc/" << getpid() << "/stat").GetString(); + return ReadProcFileField(filename, 19); +} + +#elif GTEST_OS_MAC + +size_t GetThreadCount() { + const task_t task = mach_task_self(); + mach_msg_type_number_t thread_count; + thread_act_array_t thread_list; + const kern_return_t status = task_threads(task, &thread_list, &thread_count); + if (status == KERN_SUCCESS) { + // task_threads allocates resources in thread_list and we need to free them + // to avoid leaks. + vm_deallocate(task, reinterpret_cast(thread_list), + sizeof(thread_t) * thread_count); + return static_cast(thread_count); + } else { + return 0; + } +} + +#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \ + GTEST_OS_NETBSD + +#if GTEST_OS_NETBSD +#undef KERN_PROC +#define KERN_PROC KERN_PROC2 +#define kinfo_proc kinfo_proc2 +#endif + +#if GTEST_OS_DRAGONFLY +#define KP_NLWP(kp) (kp.kp_nthreads) +#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD +#define KP_NLWP(kp) (kp.ki_numthreads) +#elif GTEST_OS_NETBSD +#define KP_NLWP(kp) (kp.p_nlwps) +#endif + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID, + getpid(), +#if GTEST_OS_NETBSD + sizeof(struct kinfo_proc), + 1, +#endif + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + struct kinfo_proc info; + size_t size = sizeof(info); + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + return static_cast(KP_NLWP(info)); +} +#elif GTEST_OS_OPENBSD + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + int mib[] = { + CTL_KERN, + KERN_PROC, + KERN_PROC_PID | KERN_PROC_SHOW_THREADS, + getpid(), + sizeof(struct kinfo_proc), + 0, + }; + u_int miblen = sizeof(mib) / sizeof(mib[0]); + + // get number of structs + size_t size; + if (sysctl(mib, miblen, NULL, &size, NULL, 0)) { + return 0; + } + + mib[5] = static_cast(size / static_cast(mib[4])); + + // populate array of structs + struct kinfo_proc info[mib[5]]; + if (sysctl(mib, miblen, &info, &size, NULL, 0)) { + return 0; + } + + // exclude empty members + size_t nthreads = 0; + for (size_t i = 0; i < size / static_cast(mib[4]); i++) { + if (info[i].p_tid != -1) nthreads++; + } + return nthreads; +} + +#elif GTEST_OS_QNX + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +size_t GetThreadCount() { + const int fd = open("/proc/self/as", O_RDONLY); + if (fd < 0) { + return 0; + } + procfs_info process_info; + const int status = + devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr); + close(fd); + if (status == EOK) { + return static_cast(process_info.num_threads); + } else { + return 0; + } +} + +#elif GTEST_OS_AIX + +size_t GetThreadCount() { + struct procentry64 entry; + pid_t pid = getpid(); + int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1); + if (status == 1) { + return entry.pi_thcount; + } else { + return 0; + } +} + +#elif GTEST_OS_FUCHSIA + +size_t GetThreadCount() { + int dummy_buffer; + size_t avail; + zx_status_t status = + zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS, + &dummy_buffer, 0, nullptr, &avail); + if (status == ZX_OK) { + return avail; + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_LINUX + +#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} + +AutoHandle::AutoHandle(Handle handle) : handle_(handle) {} + +AutoHandle::~AutoHandle() { Reset(); } + +AutoHandle::Handle AutoHandle::Get() const { return handle_; } + +void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); } + +void AutoHandle::Reset(HANDLE handle) { + // Resetting with the same handle we already own is invalid. + if (handle_ != handle) { + if (IsCloseable()) { + ::CloseHandle(handle_); + } + handle_ = handle; + } else { + GTEST_CHECK_(!IsCloseable()) + << "Resetting a valid handle to itself is likely a programmer error " + "and thus not allowed."; + } +} + +bool AutoHandle::IsCloseable() const { + // Different Windows APIs may use either of these values to represent an + // invalid handle. + return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE; +} + +Mutex::Mutex() + : owner_thread_id_(0), + type_(kDynamic), + critical_section_init_phase_(0), + critical_section_(new CRITICAL_SECTION) { + ::InitializeCriticalSection(critical_section_); +} + +Mutex::~Mutex() { + // Static mutexes are leaked intentionally. It is not thread-safe to try + // to clean them up. + if (type_ == kDynamic) { + ::DeleteCriticalSection(critical_section_); + delete critical_section_; + critical_section_ = nullptr; + } +} + +void Mutex::Lock() { + ThreadSafeLazyInit(); + ::EnterCriticalSection(critical_section_); + owner_thread_id_ = ::GetCurrentThreadId(); +} + +void Mutex::Unlock() { + ThreadSafeLazyInit(); + // We don't protect writing to owner_thread_id_ here, as it's the + // caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + owner_thread_id_ = 0; + ::LeaveCriticalSection(critical_section_); +} + +// Does nothing if the current thread holds the mutex. Otherwise, crashes +// with high probability. +void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +namespace { + +#ifdef _MSC_VER +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated { + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + } + + ~MemoryIsNotDeallocated() { + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + (void)_CrtSetDbgFlag(old_crtdbg_flag_); + } + + private: + int old_crtdbg_flag_; + + MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete; + MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete; +}; +#endif // _MSC_VER + +} // namespace + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + { + // Use RAII to flag that following mem alloc is never deallocated. +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + critical_section_ = new CRITICAL_SECTION; + } + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: + break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable* runnable, + Notification* thread_can_start) { + ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + HANDLE thread_handle = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. + GTEST_CHECK_(thread_handle != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; + if (thread_handle == nullptr) { + delete param; + } + return thread_handle; + } + + private: + struct ThreadMainParam { + ThreadMainParam(Runnable* runnable, Notification* thread_can_start) + : runnable_(runnable), thread_can_start_(thread_can_start) {} + std::unique_ptr runnable_; + // Does not own. + Notification* thread_can_start_; + }; + + static DWORD WINAPI ThreadMain(void* ptr) { + // Transfers ownership. + std::unique_ptr param(static_cast(ptr)); + if (param->thread_can_start_ != nullptr) + param->thread_can_start_->WaitForNotification(); + param->runnable_->Run(); + return 0; + } + + // Prohibit instantiation. + ThreadWithParamSupport(); + + ThreadWithParamSupport(const ThreadWithParamSupport&) = delete; + ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete; +}; + +} // namespace + +ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable, + Notification* thread_can_start) + : thread_( + ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {} + +ThreadWithParamBase::~ThreadWithParamBase() { Join(); } + +void ThreadWithParamBase::Join() { + GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0) + << "Failed to join the thread with error " << ::GetLastError() << "."; +} + +// Maps a thread to a set of ThreadIdToThreadLocals that have values +// instantiated on that thread and notifies them when the thread exits. A +// ThreadLocal instance is expected to persist until all threads it has +// values on have terminated. +class ThreadLocalRegistryImpl { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase* GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + DWORD current_thread = ::GetCurrentThreadId(); + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(current_thread); + if (thread_local_pos == thread_to_thread_locals->end()) { + thread_local_pos = + thread_to_thread_locals + ->insert(std::make_pair(current_thread, ThreadLocalValues())) + .first; + StartWatcherThreadFor(current_thread); + } + ThreadLocalValues& thread_local_values = thread_local_pos->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos == thread_local_values.end()) { + value_pos = + thread_local_values + .insert(std::make_pair( + thread_local_instance, + std::shared_ptr( + thread_local_instance->NewValueForCurrentThread()))) + .first; + } + return value_pos->second.get(); + } + + static void OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + std::vector > value_holders; + // Clean up the ThreadLocalValues data structure while holding the lock, but + // defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + for (ThreadIdToThreadLocals::iterator it = + thread_to_thread_locals->begin(); + it != thread_to_thread_locals->end(); ++it) { + ThreadLocalValues& thread_local_values = it->second; + ThreadLocalValues::iterator value_pos = + thread_local_values.find(thread_local_instance); + if (value_pos != thread_local_values.end()) { + value_holders.push_back(value_pos->second); + thread_local_values.erase(value_pos); + // This 'if' can only be successful at most once, so theoretically we + // could break out of the loop here, but we don't bother doing so. + } + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + static void OnThreadExit(DWORD thread_id) { + GTEST_CHECK_(thread_id != 0) << ::GetLastError(); + std::vector > value_holders; + // Clean up the ThreadIdToThreadLocals data structure while holding the + // lock, but defer the destruction of the ThreadLocalValueHolderBases. + { + MutexLock lock(&mutex_); + ThreadIdToThreadLocals* const thread_to_thread_locals = + GetThreadLocalsMapLocked(); + ThreadIdToThreadLocals::iterator thread_local_pos = + thread_to_thread_locals->find(thread_id); + if (thread_local_pos != thread_to_thread_locals->end()) { + ThreadLocalValues& thread_local_values = thread_local_pos->second; + for (ThreadLocalValues::iterator value_pos = + thread_local_values.begin(); + value_pos != thread_local_values.end(); ++value_pos) { + value_holders.push_back(value_pos->second); + } + thread_to_thread_locals->erase(thread_local_pos); + } + } + // Outside the lock, let the destructor for 'value_holders' deallocate the + // ThreadLocalValueHolderBases. + } + + private: + // In a particular thread, maps a ThreadLocal object to its value. + typedef std::map > + ThreadLocalValues; + // Stores all ThreadIdToThreadLocals having values in a thread, indexed by + // thread's ID. + typedef std::map ThreadIdToThreadLocals; + + // Holds the thread id and thread handle that we pass from + // StartWatcherThreadFor to WatcherThreadFunc. + typedef std::pair ThreadIdAndHandle; + + static void StartWatcherThreadFor(DWORD thread_id) { + // The returned handle will be kept in thread_map and closed by + // watcher_thread in WatcherThreadFunc. + HANDLE thread = + ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id); + GTEST_CHECK_(thread != nullptr); + // We need to pass a valid thread ID pointer into CreateThread for it + // to work correctly under Win98. + DWORD watcher_thread_id; + HANDLE watcher_thread = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size + &ThreadLocalRegistryImpl::WatcherThreadFunc, + reinterpret_cast(new ThreadIdAndHandle(thread_id, thread)), + CREATE_SUSPENDED, &watcher_thread_id); + GTEST_CHECK_(watcher_thread != nullptr) + << "CreateThread failed with error " << ::GetLastError() << "."; + // Give the watcher thread the same priority as ours to avoid being + // blocked by it. + ::SetThreadPriority(watcher_thread, + ::GetThreadPriority(::GetCurrentThread())); + ::ResumeThread(watcher_thread); + ::CloseHandle(watcher_thread); + } + + // Monitors exit from a given thread and notifies those + // ThreadIdToThreadLocals about thread termination. + static DWORD WINAPI WatcherThreadFunc(LPVOID param) { + const ThreadIdAndHandle* tah = + reinterpret_cast(param); + GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0); + OnThreadExit(tah->first); + ::CloseHandle(tah->second); + delete tah; + return 0; + } + + // Returns map of thread local instances. + static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { + mutex_.AssertHeld(); +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals(); + return map; + } + + // Protects access to GetThreadLocalsMapLocked() and its return value. + static Mutex mutex_; + // Protects access to GetThreadMapLocked() and its return value. + static Mutex thread_map_mutex_; +}; + +Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT +Mutex ThreadLocalRegistryImpl::thread_map_mutex_( + Mutex::kStaticMutex); // NOLINT + +ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread( + const ThreadLocalBase* thread_local_instance) { + return ThreadLocalRegistryImpl::GetValueOnCurrentThread( + thread_local_instance); +} + +void ThreadLocalRegistry::OnThreadLocalDestroyed( + const ThreadLocalBase* thread_local_instance) { + ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance); +} + +#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. + +RE::~RE() { + if (is_valid_) { + // regfree'ing an invalid regex might crash because the content + // of the regex is undefined. Since the regex's are essentially + // the same, one cannot be valid (or invalid) without the other + // being so too. + regfree(&partial_regex_); + regfree(&full_regex_); + } + free(const_cast(pattern_)); +} + +// Returns true if and only if regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.full_regex_, str, 1, &match, 0) == 0; +} + +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = posix::StrDup(regex); + + // Reserves enough bytes to hold the regular expression used for a + // full match. + const size_t full_regex_len = strlen(regex) + 10; + char* const full_pattern = new char[full_regex_len]; + + snprintf(full_pattern, full_regex_len, "^(%s)$", regex); + is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; + // We want to call regcomp(&partial_regex_, ...) even if the + // previous expression returns false. Otherwise partial_regex_ may + // not be properly initialized can may cause trouble when it's + // freed. + // + // Some implementation of POSIX regex (e.g. on at least some + // versions of Cygwin) doesn't accept the empty string as a valid + // regex. We change it to an equivalent form "()" to be safe. + if (is_valid_) { + const char* const partial_regex = (*regex == '\0') ? "()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true if and only if ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != nullptr; +} + +// Returns true if and only if ch belongs to the given classification. +// Unlike similar functions in , these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true if and only if "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true if and only if the given atom (specified by escaped and +// pattern) matches ch. The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': + return IsAsciiDigit(ch); + case 'D': + return !IsAsciiDigit(ch); + case 'f': + return ch == '\f'; + case 'n': + return ch == '\n'; + case 'r': + return ch == '\r'; + case 's': + return IsAsciiWhiteSpace(ch); + case 'S': + return !IsAsciiWhiteSpace(ch); + case 't': + return ch == '\t'; + case 'v': + return ch == '\v'; + case 'w': + return IsAsciiWordChar(ch); + case 'W': + return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' && ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +static std::string FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ") + .GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. +bool ValidateRegex(const char* regex) { + if (regex == nullptr) { + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True if and only if ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. + const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch + << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat, + const char* regex, const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : static_cast(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. + return true; + } + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false; + } + return false; +} + +// Returns true if and only if regex matches a prefix of str. regex must +// be a valid simple regular expression and not start with "^", or the +// result is undefined. +bool MatchRegexAtHead(const char* regex, const char* str) { + if (*regex == '\0') // An empty regex matches a prefix of anything. + return true; + + // "$" only matches the end of a string. Note that regex being + // valid guarantees that there's nothing after "$" in it. + if (*regex == '$') return *str == '\0'; + + // Is the first thing in regex an escape sequence? + const bool escaped = *regex == '\\'; + if (escaped) ++regex; + if (IsRepeat(regex[1])) { + // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so + // here's an indirect recursion. It terminates as the regex gets + // shorter in each recursion. + return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2, + str); + } else { + // regex isn't empty, isn't "$", and doesn't start with a + // repetition. We match the first atom of regex with the first + // character of str and recurse. + return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && + MatchRegexAtHead(regex + 1, str + 1); + } +} + +// Returns true if and only if regex matches any substring of str. regex must +// be a valid simple regular expression, or the result is undefined. +// +// The algorithm is recursive, but the recursion depth doesn't exceed +// the regex length, so we won't need to worry about running out of +// stack space normally. In rare cases the time complexity can be +// exponential with respect to the regex length + the string length, +// but usually it's must faster (often close to linear). +bool MatchRegexAnywhere(const char* regex, const char* str) { + if (regex == nullptr || str == nullptr) return false; + + if (*regex == '^') return MatchRegexAtHead(regex + 1, str); + + // A successful match can be anywhere in str. + do { + if (MatchRegexAtHead(regex, str)) return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast(pattern_)); + free(const_cast(full_pattern_)); +} + +// Returns true if and only if regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true if and only if regular expression re matches a substring of +// str (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = nullptr; + if (regex != nullptr) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const std::string file_name(file == nullptr ? kUnknownFile : file); + + if (line < 0) { + return file_name + ":"; + } +#ifdef _MSC_VER + return file_name + "(" + StreamableToString(line) + "):"; +#else + return file_name + ":" + StreamableToString(line) + ":"; +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line) { + const std::string file_name(file == nullptr ? kUnknownFile : file); + + if (line < 0) + return file_name; + else + return file_name + ":" + StreamableToString(line); +} + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = severity == GTEST_INFO ? "[ INFO ]" + : severity == GTEST_WARNING ? "[WARNING]" + : severity == GTEST_ERROR ? "[ ERROR ]" + : "[ FATAL ]"; + GetStream() << ::std::endl + << marker << " " << FormatFileLocation(file, line).c_str() + << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} + +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. + explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { +#if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT + char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) + << "Unable to open temporary file " << temp_file_path; + filename_ = temp_file_path; +#else + // There's no guarantee that a test has write access to the current + // directory, so we create the temporary file in a temporary directory. + std::string name_template; + +#if GTEST_OS_LINUX_ANDROID + // Note: Android applications are expected to call the framework's + // Context.getExternalStorageDirectory() method through JNI to get + // the location of the world-writable SD Card directory. However, + // this requires a Context handle, which cannot be retrieved + // globally from native code. Doing so also precludes running the + // code as part of a regular standalone executable, which doesn't + // run in a Dalvik process (e.g. when running it through 'adb shell'). + // + // The location /data/local/tmp is directly accessible from native code. + // '/sdcard' and other variants cannot be relied on, as they are not + // guaranteed to be mounted, or may have a delay in mounting. + name_template = "/data/local/tmp/"; +#elif GTEST_OS_IOS + char user_temp_dir[PATH_MAX + 1]; + + // Documented alternative to NSTemporaryDirectory() (for obtaining creating + // a temporary directory) at + // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10 + // + // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not + // documented in the confstr() man page at + // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr + // but are still available, according to the WebKit patches at + // https://trac.webkit.org/changeset/262004/webkit + // https://trac.webkit.org/changeset/263705/webkit + // + // The confstr() implementation falls back to getenv("TMPDIR"). See + // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html + ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir)); + + name_template = user_temp_dir; + if (name_template.back() != GTEST_PATH_SEP_[0]) + name_template.push_back(GTEST_PATH_SEP_[0]); +#else + name_template = "/tmp/"; +#endif + name_template.append("gtest_captured_stream.XXXXXX"); + + // mkstemp() modifies the string bytes in place, and does not go beyond the + // string's length. This results in well-defined behavior in C++17. + // + // The const_cast is needed below C++17. The constraints on std::string + // implementations in C++11 and above make assumption behind the const_cast + // fairly safe. + const int captured_fd = ::mkstemp(const_cast(name_template.data())); + if (captured_fd == -1) { + GTEST_LOG_(WARNING) + << "Failed to create tmp file " << name_template + << " for test; does the test have access to the /tmp directory?"; + } + filename_ = std::move(name_template); +#endif // GTEST_OS_WINDOWS + fflush(nullptr); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { remove(filename_.c_str()); } + + std::string GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(nullptr); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + if (file == nullptr) { + GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_ + << " for capturing stream."; + } + const std::string content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + CapturedStream(const CapturedStream&) = delete; + CapturedStream& operator=(const CapturedStream&) = delete; +}; + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +static CapturedStream* g_captured_stderr = nullptr; +static CapturedStream* g_captured_stdout = nullptr; + +// Starts capturing an output stream (stdout/stderr). +static void CaptureStream(int fd, const char* stream_name, + CapturedStream** stream) { + if (*stream != nullptr) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +static std::string GetCapturedStream(CapturedStream** captured_stream) { + const std::string content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = nullptr; + + return content; +} + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // defined(_MSC_VER) || defined(__BORLANDC__) + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. +std::string GetCapturedStdout() { + return GetCapturedStream(&g_captured_stdout); +} + +// Stops capturing stderr and returns the captured string. +std::string GetCapturedStderr() { + return GetCapturedStream(&g_captured_stderr); +} + +#endif // GTEST_HAS_STREAM_REDIRECTION + +size_t GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast(ftell(file)); +} + +std::string ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. + do { + bytes_last_read = + fread(buffer + bytes_read, 1, file_size - bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const std::string content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +#if GTEST_HAS_DEATH_TEST +static const std::vector* g_injected_test_argvs = + nullptr; // Owned. + +std::vector GetInjectableArgvs() { + if (g_injected_test_argvs != nullptr) { + return *g_injected_test_argvs; + } + return GetArgvs(); +} + +void SetInjectableArgvs(const std::vector* new_argvs) { + if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs; + g_injected_test_argvs = new_argvs; +} + +void SetInjectableArgvs(const std::vector& new_argvs) { + SetInjectableArgvs( + new std::vector(new_argvs.begin(), new_argvs.end())); +} + +void ClearInjectableArgvs() { + delete g_injected_test_argvs; + g_injected_test_argvs = nullptr; +} +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, int32_t* value) { + // Parses the environment variable as a decimal integer. + char* end = nullptr; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an int32_t? + const auto result = static_cast(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an int32_t. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true if and only if it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { +#if defined(GTEST_GET_BOOL_FROM_ENV_) + return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == nullptr ? default_value + : strcmp(string_value, "0") != 0; +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) { +#if defined(GTEST_GET_INT32_FROM_ENV_) + return GTEST_GET_INT32_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == nullptr) { + // The environment variable is not set. + return default_value; + } + + int32_t result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, string_value, + &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +#endif // defined(GTEST_GET_INT32_FROM_ENV_) +} + +// As a special case for the 'output' flag, if GTEST_OUTPUT is not +// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build +// system. The value of XML_OUTPUT_FILE is a filename without the +// "xml:" prefix of GTEST_OUTPUT. +// Note that this is meant to be called at the call site so it does +// not check that the flag is 'output' +// In essence this checks an env variable called XML_OUTPUT_FILE +// and if it is set we prepend "xml:" to its value, if it not set we return "" +std::string OutputFlagAlsoCheckEnvVar() { + std::string default_value_for_output_flag = ""; + const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); + if (nullptr != xml_output_file_env) { + default_value_for_output_flag = std::string("xml:") + xml_output_file_env; + } + return default_value_for_output_flag; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +const char* StringFromGTestEnv(const char* flag, const char* default_value) { +#if defined(GTEST_GET_STRING_FROM_ENV_) + return GTEST_GET_STRING_FROM_ENV_(flag, default_value); +#else + const std::string env_var = FlagToEnvVar(flag); + const char* const value = posix::GetEnv(env_var.c_str()); + return value == nullptr ? default_value : value; +#endif // defined(GTEST_GET_STRING_FROM_ENV_) +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc new file mode 100644 index 0000000000..f3976d230d --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -0,0 +1,553 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include "gtest/gtest-printers.h" + +#include + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +namespace { + +using ::std::ostream; + +// Prints a segment of bytes in the given object. +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. + const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +// Helpers for widening a character to char32_t. Since the standard does not +// specify if char / wchar_t is signed or unsigned, it is important to first +// convert it to the unsigned type of the same width before widening it to +// char32_t. +template +char32_t ToChar32(CharType in) { + return static_cast( + static_cast::type>(in)); +} + +} // namespace + +namespace internal { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), +// - as a hexadecimal escape sequence (e.g. '\x7F'), or +// - as a special escape sequence (e.g. '\r', '\n'). +enum CharFormat { kAsIs, kHexEscape, kSpecialEscape }; + +// Returns true if c is a printable ASCII character. We test the +// value of c directly instead of calling isprint(), which is buggy on +// Windows Mobile. +inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; } + +// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a +// character literal without the quotes, escaping it when necessary; returns how +// c was formatted. +template +static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { + const char32_t u_c = ToChar32(c); + switch (u_c) { + case L'\0': + *os << "\\0"; + break; + case L'\'': + *os << "\\'"; + break; + case L'\\': + *os << "\\\\"; + break; + case L'\a': + *os << "\\a"; + break; + case L'\b': + *os << "\\b"; + break; + case L'\f': + *os << "\\f"; + break; + case L'\n': + *os << "\\n"; + break; + case L'\r': + *os << "\\r"; + break; + case L'\t': + *os << "\\t"; + break; + case L'\v': + *os << "\\v"; + break; + default: + if (IsPrintableAscii(u_c)) { + *os << static_cast(c); + return kAsIs; + } else { + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase << static_cast(u_c); + os->flags(flags); + return kHexEscape; + } + } + return kSpecialEscape; +} + +// Prints a char32_t c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) { + switch (c) { + case L'\'': + *os << "'"; + return kAsIs; + case L'"': + *os << "\\\""; + return kSpecialEscape; + default: + return PrintAsCharLiteralTo(c, os); + } +} + +static const char* GetCharWidthPrefix(char) { return ""; } + +static const char* GetCharWidthPrefix(signed char) { return ""; } + +static const char* GetCharWidthPrefix(unsigned char) { return ""; } + +#ifdef __cpp_char8_t +static const char* GetCharWidthPrefix(char8_t) { return "u8"; } +#endif + +static const char* GetCharWidthPrefix(char16_t) { return "u"; } + +static const char* GetCharWidthPrefix(char32_t) { return "U"; } + +static const char* GetCharWidthPrefix(wchar_t) { return "L"; } + +// Prints a char c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +#ifdef __cpp_char8_t +static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} +#endif + +static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) { + return PrintAsStringLiteralTo(ToChar32(c), os); +} + +// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t) +// and its code. '\0' is printed as "'\\0'", other unprintable characters are +// also properly escaped using the standard C++ escape sequence. +template +void PrintCharAndCodeTo(Char c, ostream* os) { + // First, print c as a literal in the most readable form we can find. + *os << GetCharWidthPrefix(c) << "'"; + const CharFormat format = PrintAsCharLiteralTo(c, os); + *os << "'"; + + // To aid user debugging, we also print c's code in decimal, unless + // it's 0 (in which case c was printed as '\\0', making the code + // obvious). + if (c == 0) return; + *os << " (" << static_cast(c); + + // For more convenience, we print c's code again in hexadecimal, + // unless c was already printed in the form '\x##' or the code is in + // [1, 9]. + if (format == kHexEscape || (1 <= c && c <= 9)) { + // Do nothing. + } else { + *os << ", 0x" << String::FormatHexInt(static_cast(c)); + } + *os << ")"; +} + +void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } +void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); } + +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its code. L'\0' is printed as "L'\\0'". +void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); } + +// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well. +void PrintTo(char32_t c, ::std::ostream* os) { + *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4) + << static_cast(c); +} + +// gcc/clang __{u,}int128_t +#if defined(__SIZEOF_INT128__) +void PrintTo(__uint128_t v, ::std::ostream* os) { + if (v == 0) { + *os << "0"; + return; + } + + // Buffer large enough for ceil(log10(2^128))==39 and the null terminator + char buf[40]; + char* p = buf + sizeof(buf); + + // Some configurations have a __uint128_t, but no support for built in + // division. Do manual long division instead. + + uint64_t high = static_cast(v >> 64); + uint64_t low = static_cast(v); + + *--p = 0; + while (high != 0 || low != 0) { + uint64_t high_mod = high % 10; + high = high / 10; + // This is the long division algorithm specialized for a divisor of 10 and + // only two elements. + // Notable values: + // 2^64 / 10 == 1844674407370955161 + // 2^64 % 10 == 6 + const uint64_t carry = 6 * high_mod + low % 10; + low = low / 10 + high_mod * 1844674407370955161 + carry / 10; + + char digit = static_cast(carry % 10); + *--p = '0' + digit; + } + *os << p; +} +void PrintTo(__int128_t v, ::std::ostream* os) { + __uint128_t uv = static_cast<__uint128_t>(v); + if (v < 0) { + *os << "-"; + uv = -uv; + } + PrintTo(uv, os); +} +#endif // __SIZEOF_INT128__ + +// Prints the given array of characters to the ostream. CharType must be either +// char, char8_t, char16_t, char32_t, or wchar_t. +// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) { + const char* const quote_prefix = GetCharWidthPrefix(*begin); + *os << quote_prefix << "\""; + bool is_previous_hex = false; + CharFormat print_format = kAsIs; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << quote_prefix << "\""; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } + } + *os << "\""; + return print_format; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType* begin, size_t len, + ostream* os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +#ifdef __cpp_char8_t +// Prints a (const) char8_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} +#endif + +// Prints a (const) char16_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) char32_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) { + UniversalPrintCharArray(begin, len, os); +} + +namespace { + +// Prints a null-terminated C-style string to the ostream. +template +void PrintCStringTo(const Char* s, ostream* os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, std::char_traits::length(s), os); + } +} + +} // anonymous namespace + +void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); } + +#ifdef __cpp_char8_t +void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); } +#endif + +void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); } + +void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); } + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); } +#endif // wchar_t is native + +namespace { + +bool ContainsUnprintableControlCodes(const char* str, size_t length) { + const unsigned char* s = reinterpret_cast(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': + break; + default: + return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } + +bool IsValidUTF8(const char* str, size_t length) { + const unsigned char* s = reinterpret_cast(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) { + *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + +void PrintStringTo(const ::std::string& s, ostream* os) { + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG_GET(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } +} + +#ifdef __cpp_char8_t +void PrintU8StringTo(const ::std::u8string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif + +void PrintU16StringTo(const ::std::u16string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +void PrintU32StringTo(const ::std::u32string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc new file mode 100644 index 0000000000..eb7c8d1cf9 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -0,0 +1,105 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == nullptr ? message : std::string(message, stack_trace); +} + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os << internal::FormatFileLocation(result.file_name(), + result.line_number()) + << " " + << (result.type() == TestPartResult::kSuccess ? "Success" + : result.type() == TestPartResult::kSkip ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") + << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[static_cast(index)]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc new file mode 100644 index 0000000000..a2828b83c6 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -0,0 +1,104 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/gtest-typed-test.h" + +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) str++; + return str; +} + +static std::vector SplitIntoTestNames(const char* src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != nullptr; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestSuitePState::VerifyRegisteredTestNames( + const char* test_suite_name, const char* file, int line, + const char* registered_tests) { + RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line)); + + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string& name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + if (registered_tests_.count(name) != 0) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test suite.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +} // namespace internal +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc new file mode 100644 index 0000000000..6f31dd2260 --- /dev/null +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest.cc @@ -0,0 +1,6795 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "gtest/gtest-assertion-result.h" +#include "gtest/gtest-spi.h" +#include "gtest/internal/custom/gtest.h" + +#if GTEST_OS_LINUX + +#include // NOLINT +#include // NOLINT +#include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#include + +#elif GTEST_OS_ZOS +#include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +#include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +#include // NOLINT +#undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +#include // NOLINT +#undef min + +#ifdef _MSC_VER +#include // NOLINT +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#if GTEST_OS_WINDOWS_MINGW +#include // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW + +#else + +// cpplint thinks that the header is already included, so we want to +// silence it. +#include // NOLINT +#include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#endif + +#include "src/gtest-internal-inl.h" + +#if GTEST_OS_WINDOWS +#define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/flags/parse.h" +#include "absl/flags/usage.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test suite name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test suite whose name matches this filter is considered a death +// test suite and will be run before test suites whose name doesn't +// match this filter. +static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +bool g_help_flag = false; + +// Utility function to Open File for Writing +static FILE* OpenFileForWriting(const std::string& output_file) { + FILE* fileout = nullptr; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == nullptr) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + +} // namespace internal + +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. +static const char* GetDefaultFilter() { + const char* const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); + if (testbridge_test_only != nullptr) { + return testbridge_test_only; + } + return kUniversalFilter; +} + +// Bazel passes in the argument to '--test_runner_fail_fast' via the +// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable. +static bool GetDefaultFailFast() { + const char* const testbridge_test_runner_fail_fast = + internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST"); + if (testbridge_test_runner_fail_fast != nullptr) { + return strcmp(testbridge_test_runner_fail_fast, "1") == 0; + } + return false; +} + +} // namespace testing + +GTEST_DEFINE_bool_( + fail_fast, + testing::internal::BoolFromGTestEnv("fail_fast", + testing::GetDefaultFailFast()), + "True if and only if a test failure should stop further test execution."); + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + testing::internal::BoolFromGTestEnv("break_on_failure", false), + "True if and only if a failed assertion should be a debugger " + "break-point."); + +GTEST_DEFINE_bool_(catch_exceptions, + testing::internal::BoolFromGTestEnv("catch_exceptions", + true), + "True if and only if " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, testing::internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, + testing::internal::StringFromGTestEnv("filter", + testing::GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_( + install_failure_signal_handler, + testing::internal::BoolFromGTestEnv("install_failure_signal_handler", + false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); + +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' +GTEST_DEFINE_string_( + output, + testing::internal::StringFromGTestEnv( + "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + brief, testing::internal::BoolFromGTestEnv("brief", false), + "True if only test failures should be displayed in text output."); + +GTEST_DEFINE_bool_(print_time, + testing::internal::BoolFromGTestEnv("print_time", true), + "True if and only if " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_bool_(print_utf8, + testing::internal::BoolFromGTestEnv("print_utf8", true), + "True if and only if " GTEST_NAME_ + " prints UTF8 characters as text."); + +GTEST_DEFINE_int32_( + random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, testing::internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + recreate_environments_when_repeating, + testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating", + false), + "Controls whether global test environments are recreated for each repeat " + "of the tests. If set to false the global test environments are only set " + "up once, for the first iteration, and only torn down once, for the last. " + "Useful for shaking out flaky tests with stable, expensive test " + "environments. If --gtest_repeat is set to a negative number, meaning " + "there is no last run, the environments will always be recreated to avoid " + "leaks."); + +GTEST_DEFINE_bool_(show_internal_stack_frames, false, + "True if and only if " GTEST_NAME_ + " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_(shuffle, + testing::internal::BoolFromGTestEnv("shuffle", false), + "True if and only if " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + testing::internal::Int32FromGTestEnv("stack_trace_depth", + testing::kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + testing::internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + testing::internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise. For use with an external test framework."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, testing::internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace testing { +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +uint32_t Random::Generate(uint32_t range) { + // These constants are the same as are used in glibc's rand(3). + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true if and only if the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +static bool GTestIsInitialized() { return GetArgvs().size() > 0; } + +// Iterates over a vector of TestSuites, keeping a running sum of the +// results of calling a given int-returning method on each. +// Returns the sum. +static int SumOverTestSuiteList(const std::vector& case_list, + int (TestSuite::*method)() const) { + int sum = 0; + for (size_t i = 0; i < case_list.size(); i++) { + sum += (case_list[i]->*method)(); + } + return sum; +} + +// Returns true if and only if the test suite passed. +static bool TestSuitePassed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Passed(); +} + +// Returns true if and only if the test suite failed. +static bool TestSuiteFailed(const TestSuite* test_suite) { + return test_suite->should_run() && test_suite->Failed(); +} + +// Returns true if and only if test_suite contains at least one test that +// should run. +static bool ShouldRunTestSuite(const TestSuite* test_suite) { + return test_suite->should_run(); +} + +// AssertHelper constructor. +AssertHelper::AssertHelper(TestPartResult::Type type, const char* file, + int line, const char* message) + : data_(new AssertHelperData(type, file, line, message)) {} + +AssertHelper::~AssertHelper() { delete data_; } + +// Message assignment, for assertion streaming support. +void AssertHelper::operator=(const Message& message) const { + UnitTest::GetInstance()->AddTestPartResult( + data_->type, data_->file, data_->line, + AppendUserMessage(data_->message, message), + UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1) + // Skips the stack frame for this function itself. + ); // NOLINT +} + +namespace { + +// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P +// to creates test cases for it, a synthetic test case is +// inserted to report ether an error or a log message. +// +// This configuration bit will likely be removed at some point. +constexpr bool kErrorOnUninstantiatedParameterizedTest = true; +constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = true; + +// A test that fails at a given file/line location with a given message. +class FailureTest : public Test { + public: + explicit FailureTest(const CodeLocation& loc, std::string error_message, + bool as_error) + : loc_(loc), + error_message_(std::move(error_message)), + as_error_(as_error) {} + + void TestBody() override { + if (as_error_) { + AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(), + loc_.line, "") = Message() << error_message_; + } else { + std::cout << error_message_ << std::endl; + } + } + + private: + const CodeLocation loc_; + const std::string error_message_; + const bool as_error_; +}; + +} // namespace + +std::set* GetIgnoredParameterizedTestSuites() { + return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites(); +} + +// Add a given test_suit to the list of them allow to go un-instantiated. +MarkAsIgnored::MarkAsIgnored(const char* test_suite) { + GetIgnoredParameterizedTestSuites()->insert(test_suite); +} + +// If this parameterized test suite has no instantiations (and that +// has not been marked as okay), emit a test case reporting that. +void InsertSyntheticTestCase(const std::string& name, CodeLocation location, + bool has_test_p) { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + if (ignored.find(name) != ignored.end()) return; + + const char kMissingInstantiation[] = // + " is defined via TEST_P, but never instantiated. None of the test cases " + "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only " + "ones provided expand to nothing." + "\n\n" + "Ideally, TEST_P definitions should only ever be included as part of " + "binaries that intend to use them. (As opposed to, for example, being " + "placed in a library that may be linked in to get other utilities.)"; + + const char kMissingTestCase[] = // + " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are " + "defined via TEST_P . No test cases will run." + "\n\n" + "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from " + "code that always depend on code that provides TEST_P. Failing to do " + "so is often an indication of dead code, e.g. the last TEST_P was " + "removed but the rest got left behind."; + + std::string message = + "Parameterized test suite " + name + + (has_test_p ? kMissingInstantiation : kMissingTestCase) + + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is defined in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + name + ");"; + + std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + location.file.c_str(), location.line, [message, location] { + return new FailureTest(location, message, + kErrorOnUninstantiatedParameterizedTest); + }); +} + +void RegisterTypeParameterizedTestSuite(const char* test_suite_name, + CodeLocation code_location) { + GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite( + test_suite_name, code_location); +} + +void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) { + GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation( + case_name); +} + +void TypeParameterizedTestSuiteRegistry::RegisterTestSuite( + const char* test_suite_name, CodeLocation code_location) { + suites_.emplace(std::string(test_suite_name), + TypeParameterizedTestSuiteInfo(code_location)); +} + +void TypeParameterizedTestSuiteRegistry::RegisterInstantiation( + const char* test_suite_name) { + auto it = suites_.find(std::string(test_suite_name)); + if (it != suites_.end()) { + it->second.instantiated = true; + } else { + GTEST_LOG_(ERROR) << "Unknown type parameterized test suit '" + << test_suite_name << "'"; + } +} + +void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() { + const auto& ignored = *GetIgnoredParameterizedTestSuites(); + for (const auto& testcase : suites_) { + if (testcase.second.instantiated) continue; + if (ignored.find(testcase.first) != ignored.end()) continue; + + std::string message = + "Type parameterized test suite " + testcase.first + + " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated " + "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run." + "\n\n" + "Ideally, TYPED_TEST_P definitions should only ever be included as " + "part of binaries that intend to use them. (As opposed to, for " + "example, being placed in a library that may be linked in to get other " + "utilities.)" + "\n\n" + "To suppress this error for this test suite, insert the following line " + "(in a non-header) in the namespace it is defined in:" + "\n\n" + "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + + testcase.first + ");"; + + std::string full_name = + "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">"; + RegisterTest( // + "GoogleTestVerification", full_name.c_str(), + nullptr, // No type parameter. + nullptr, // No value parameter. + testcase.second.code_location.file.c_str(), + testcase.second.code_location.line, [message, testcase] { + return new FailureTest(testcase.second.code_location, message, + kErrorOnUninstantiatedTypeParameterizedTest); + }); + } +} + +// A copy of all command line arguments. Set by InitGoogleTest(). +static ::std::vector g_argvs; + +::std::vector GetArgvs() { +#if defined(GTEST_CUSTOM_GET_ARGVS_) + // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or + // ::string. This code converts it to the appropriate type. + const auto& custom = GTEST_CUSTOM_GET_ARGVS_(); + return ::std::vector(custom.begin(), custom.end()); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) + return g_argvs; +#endif // defined(GTEST_CUSTOM_GET_ARGVS_) +} + +// Returns the current application's name, removing directory path if that +// is present. +FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS || GTEST_OS_OS2 + result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe")); +#else + result.Set(FilePath(GetArgvs()[0])); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. + +// Returns the output format, or "" for normal printed output. +std::string UnitTestOptions::GetOutputFormat() { + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == nullptr) + ? std::string(gtest_output_flag) + : std::string(gtest_output_flag, + static_cast(colon - gtest_output_flag)); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +std::string UnitTestOptions::GetAbsolutePathToOutputFile() { + std::string s = GTEST_FLAG_GET(output); + const char* const gtest_output_flag = s.c_str(); + + std::string format = GetOutputFormat(); + if (format.empty()) format = std::string(kDefaultOutputFormat); + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == nullptr) + return internal::FilePath::MakeFileName( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile), 0, format.c_str()) + .string(); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + output_name = internal::FilePath::ConcatPaths( + internal::FilePath(UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(colon + 1)); + + if (!output_name.IsDirectory()) return output_name.string(); + + internal::FilePath result(internal::FilePath::GenerateUniqueFileName( + output_name, internal::GetCurrentExecutableName(), + GetOutputFormat().c_str())); + return result.string(); +} + +// Returns true if and only if the wildcard pattern matches the string. Each +// pattern consists of regular characters, single-character wildcards (?), and +// multi-character wildcards (*). +// +// This function implements a linear-time string globbing algorithm based on +// https://research.swtch.com/glob. +static bool PatternMatchesString(const std::string& name_str, + const char* pattern, const char* pattern_end) { + const char* name = name_str.c_str(); + const char* const name_begin = name; + const char* const name_end = name + name_str.size(); + + const char* pattern_next = pattern; + const char* name_next = name; + + while (pattern < pattern_end || name < name_end) { + if (pattern < pattern_end) { + switch (*pattern) { + default: // Match an ordinary character. + if (name < name_end && *name == *pattern) { + ++pattern; + ++name; + continue; + } + break; + case '?': // Match any single character. + if (name < name_end) { + ++pattern; + ++name; + continue; + } + break; + case '*': + // Match zero or more characters. Start by skipping over the wildcard + // and matching zero characters from name. If that fails, restart and + // match one more character than the last attempt. + pattern_next = pattern; + name_next = name + 1; + ++pattern; + continue; + } + } + // Failed to match a character. Restart if possible. + if (name_begin < name_next && name_next <= name_end) { + pattern = pattern_next; + name = name_next; + continue; + } + return false; + } + return true; +} + +namespace { + +bool IsGlobPattern(const std::string& pattern) { + return std::any_of(pattern.begin(), pattern.end(), + [](const char c) { return c == '?' || c == '*'; }); +} + +class UnitTestFilter { + public: + UnitTestFilter() = default; + + // Constructs a filter from a string of patterns separated by `:`. + explicit UnitTestFilter(const std::string& filter) { + // By design "" filter matches "" string. + std::vector all_patterns; + SplitString(filter, ':', &all_patterns); + const auto exact_match_patterns_begin = std::partition( + all_patterns.begin(), all_patterns.end(), &IsGlobPattern); + + glob_patterns_.reserve(static_cast( + std::distance(all_patterns.begin(), exact_match_patterns_begin))); + std::move(all_patterns.begin(), exact_match_patterns_begin, + std::inserter(glob_patterns_, glob_patterns_.begin())); + std::move( + exact_match_patterns_begin, all_patterns.end(), + std::inserter(exact_match_patterns_, exact_match_patterns_.begin())); + } + + // Returns true if and only if name matches at least one of the patterns in + // the filter. + bool MatchesName(const std::string& name) const { + return exact_match_patterns_.count(name) > 0 || + std::any_of(glob_patterns_.begin(), glob_patterns_.end(), + [&name](const std::string& pattern) { + return PatternMatchesString( + name, pattern.c_str(), + pattern.c_str() + pattern.size()); + }); + } + + private: + std::vector glob_patterns_; + std::unordered_set exact_match_patterns_; +}; + +class PositiveAndNegativeUnitTestFilter { + public: + // Constructs a positive and a negative filter from a string. The string + // contains a positive filter optionally followed by a '-' character and a + // negative filter. In case only a negative filter is provided the positive + // filter will be assumed "*". + // A filter is a list of patterns separated by ':'. + explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) { + std::vector positive_and_negative_filters; + + // NOTE: `SplitString` always returns a non-empty container. + SplitString(filter, '-', &positive_and_negative_filters); + const auto& positive_filter = positive_and_negative_filters.front(); + + if (positive_and_negative_filters.size() > 1) { + positive_filter_ = UnitTestFilter( + positive_filter.empty() ? kUniversalFilter : positive_filter); + + // TODO(b/214626361): Fail on multiple '-' characters + // For the moment to preserve old behavior we concatenate the rest of the + // string parts with `-` as separator to generate the negative filter. + auto negative_filter_string = positive_and_negative_filters[1]; + for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++) + negative_filter_string = + negative_filter_string + '-' + positive_and_negative_filters[i]; + negative_filter_ = UnitTestFilter(negative_filter_string); + } else { + // In case we don't have a negative filter and positive filter is "" + // we do not use kUniversalFilter by design as opposed to when we have a + // negative filter. + positive_filter_ = UnitTestFilter(positive_filter); + } + } + + // Returns true if and only if test name (this is generated by appending test + // suit name and test name via a '.' character) matches the positive filter + // and does not match the negative filter. + bool MatchesTest(const std::string& test_suite_name, + const std::string& test_name) const { + return MatchesName(test_suite_name + "." + test_name); + } + + // Returns true if and only if name matches the positive filter and does not + // match the negative filter. + bool MatchesName(const std::string& name) const { + return positive_filter_.MatchesName(name) && + !negative_filter_.MatchesName(name); + } + + private: + UnitTestFilter positive_filter_; + UnitTestFilter negative_filter_; +}; +} // namespace + +bool UnitTestOptions::MatchesFilter(const std::string& name_str, + const char* filter) { + return UnitTestFilter(filter).MatchesName(name_str); +} + +// Returns true if and only if the user-specified filter matches the test +// suite name and the test name. +bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name, + const std::string& test_name) { + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter)) + .MatchesTest(test_suite_name, test_name); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). + const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG_GET(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { return GetTypeId(); } + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. +static AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const std::string& substr) { + const std::string expected(type == TestPartResult::kFatalFailure + ? "1 fatal failure" + : "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == nullptr) { + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const std::string& substr) + : results_(results), type_(type), substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) + : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test suites. +int UnitTestImpl::successful_test_suite_count() const { + return CountIf(test_suites_, TestSuitePassed); +} + +// Gets the number of failed test suites. +int UnitTestImpl::failed_test_suite_count() const { + return CountIf(test_suites_, TestSuiteFailed); +} + +// Gets the number of all test suites. +int UnitTestImpl::total_test_suite_count() const { + return static_cast(test_suites_.size()); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTestImpl::test_suite_to_run_count() const { + return CountIf(test_suites_, ShouldRunTestSuite); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count); +} + +// Gets the number of skipped tests. +int UnitTestImpl::skipped_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, + &TestSuite::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// A helper class for measuring elapsed times. +class Timer { + public: + Timer() : start_(std::chrono::steady_clock::now()) {} + + // Return time elapsed in milliseconds since the timer was created. + TimeInMillis Elapsed() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_) + .count(); + } + + private: + std::chrono::steady_clock::time_point start_; +}; + +// Returns a timestamp as milliseconds since the epoch. Note this time may jump +// around subject to adjustments by the system, to measure elapsed time use +// Timer instead. +TimeInMillis GetTimeInMillis() { + return std::chrono::duration_cast( + std::chrono::system_clock::now() - + std::chrono::system_clock::from_time_t(0)) + .count(); +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return nullptr; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return nullptr; + const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, + 0, nullptr, nullptr); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr, + nullptr); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true if and only if they have the same +// content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char* lhs, const char* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + for (size_t i = 0; i != length;) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING + +void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. +Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message& Message::operator<<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message& Message::operator<<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator<<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector& left, + const std::vector& right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. + costs[l_i + 1][r_i + 1] = costs[l_i][r_i]; + best_move[l_i + 1][r_i + 1] = kMatch; + continue; + } + + const double add = costs[l_i + 1][r_i]; + const double remove = costs[l_i][r_i + 1]; + const double replace = costs[l_i][r_i]; + if (add < remove && add < replace) { + costs[l_i + 1][r_i + 1] = add + 1; + best_move[l_i + 1][r_i + 1] = kAdd; + } else if (remove < add && remove < replace) { + costs[l_i + 1][r_i + 1] = remove + 1; + best_move[l_i + 1][r_i + 1] = kRemove; + } else { + // We make replace a little more expensive than add/remove to lower + // their priority. + costs[l_i + 1][r_i + 1] = replace + 1.00001; + best_move[l_i + 1][r_i + 1] = kReplace; + } + } + } + + // Reconstruct the best path. We do it in reverse order. + std::vector best_path; + for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) { + EditType move = best_move[l_i][r_i]; + best_path.push_back(move); + l_i -= move != kAdd; + r_i -= move != kRemove; + } + std::reverse(best_path.begin(), best_path.end()); + return best_path; +} + +namespace { + +// Helper class to convert string into ids with deduplication. +class InternalStrings { + public: + size_t GetId(const std::string& str) { + IdMap::iterator it = ids_.find(str); + if (it != ids_.end()) return it->second; + size_t id = ids_.size(); + return ids_[str] = id; + } + + private: + typedef std::map IdMap; + IdMap ids_; +}; + +} // namespace + +std::vector CalculateOptimalEdits( + const std::vector& left, + const std::vector& right) { + std::vector left_ids, right_ids; + { + InternalStrings intern_table; + for (size_t i = 0; i < left.size(); ++i) { + left_ids.push_back(intern_table.GetId(left[i])); + } + for (size_t i = 0; i < right.size(); ++i) { + right_ids.push_back(intern_table.GetId(right[i])); + } + } + return CalculateOptimalEdits(left_ids, right_ids); +} + +namespace { + +// Helper class that holds the state for one hunk and prints it out to the +// stream. +// It reorders adds/removes when possible to group all removes before all +// adds. It also adds the hunk header before printint into the stream. +class Hunk { + public: + Hunk(size_t left_start, size_t right_start) + : left_start_(left_start), + right_start_(right_start), + adds_(), + removes_(), + common_() {} + + void PushLine(char edit, const char* line) { + switch (edit) { + case ' ': + ++common_; + FlushEdits(); + hunk_.push_back(std::make_pair(' ', line)); + break; + case '-': + ++removes_; + hunk_removes_.push_back(std::make_pair('-', line)); + break; + case '+': + ++adds_; + hunk_adds_.push_back(std::make_pair('+', line)); + break; + } + } + + void PrintTo(std::ostream* os) { + PrintHeader(os); + FlushEdits(); + for (std::list >::const_iterator it = + hunk_.begin(); + it != hunk_.end(); ++it) { + *os << it->first << it->second << "\n"; + } + } + + bool has_edits() const { return adds_ || removes_; } + + private: + void FlushEdits() { + hunk_.splice(hunk_.end(), hunk_removes_); + hunk_.splice(hunk_.end(), hunk_adds_); + } + + // Print a unified diff header for one hunk. + // The format is + // "@@ -, +, @@" + // where the left/right parts are omitted if unnecessary. + void PrintHeader(std::ostream* ss) const { + *ss << "@@ "; + if (removes_) { + *ss << "-" << left_start_ << "," << (removes_ + common_); + } + if (removes_ && adds_) { + *ss << " "; + } + if (adds_) { + *ss << "+" << right_start_ << "," << (adds_ + common_); + } + *ss << " @@\n"; + } + + size_t left_start_, right_start_; + size_t adds_, removes_, common_; + std::list > hunk_, hunk_adds_, hunk_removes_; +}; + +} // namespace + +// Create a list of diff hunks in Unified diff format. +// Each hunk has a header generated by PrintHeader above plus a body with +// lines prefixed with ' ' for no change, '-' for deletion and '+' for +// addition. +// 'context' represents the desired unchanged prefix/suffix around the diff. +// If two hunks are close enough that their contexts overlap, then they are +// joined into one hunk. +std::string CreateUnifiedDiff(const std::vector& left, + const std::vector& right, + size_t context) { + const std::vector edits = CalculateOptimalEdits(left, right); + + size_t l_i = 0, r_i = 0, edit_i = 0; + std::stringstream ss; + while (edit_i < edits.size()) { + // Find first edit. + while (edit_i < edits.size() && edits[edit_i] == kMatch) { + ++l_i; + ++r_i; + ++edit_i; + } + + // Find the first line to include in the hunk. + const size_t prefix_context = std::min(l_i, context); + Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); + for (size_t i = prefix_context; i > 0; --i) { + hunk.PushLine(' ', left[l_i - i].c_str()); + } + + // Iterate the edits until we found enough suffix for the hunk or the input + // is over. + size_t n_suffix = 0; + for (; edit_i < edits.size(); ++edit_i) { + if (n_suffix >= context) { + // Continue only if the next hunk is very close. + auto it = edits.begin() + static_cast(edit_i); + while (it != edits.end() && *it == kMatch) ++it; + if (it == edits.end() || + static_cast(it - edits.begin()) - edit_i >= context) { + // There is no next edit or it is too far away. + break; + } + } + + EditType edit = edits[edit_i]; + // Reset count when a non match is found. + n_suffix = edit == kMatch ? n_suffix + 1 : 0; + + if (edit == kMatch || edit == kRemove || edit == kReplace) { + hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str()); + } + if (edit == kAdd || edit == kReplace) { + hunk.PushLine('+', right[r_i].c_str()); + } + + // Advance indices, depending on edit type. + l_i += edit != kAdd; + r_i += edit != kRemove; + } + + if (!hunk.has_edits()) { + // We are done. We don't want this hunk. + break; + } + + hunk.PrintTo(&ss); + } + return ss.str(); +} + +} // namespace edit_distance + +namespace { + +// The string representation of the values received in EqFailure() are already +// escaped. Split them on escaped '\n' boundaries. Leave all other escaped +// characters the same. +std::vector SplitEscapedString(const std::string& str) { + std::vector lines; + size_t start = 0, end = str.size(); + if (end > 2 && str[0] == '"' && str[end - 1] == '"') { + ++start; + --end; + } + bool escaped = false; + for (size_t i = start; i + 1 < end; ++i) { + if (escaped) { + escaped = false; + if (str[i] == 'n') { + lines.push_back(str.substr(start, i - start - 1)); + start = i + 1; + } + } else { + escaped = str[i] == '\\'; + } + } + lines.push_back(str.substr(start, end - start)); + return lines; +} + +} // namespace + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// lhs_expression: "foo" +// rhs_expression: "bar" +// lhs_value: "5" +// rhs_value: "6" +// +// The ignoring_case parameter is true if and only if the assertion is a +// *_STRCASEEQ*. When it's true, the string "Ignoring case" will +// be inserted into the message. +AssertionResult EqFailure(const char* lhs_expression, + const char* rhs_expression, + const std::string& lhs_value, + const std::string& rhs_value, bool ignoring_case) { + Message msg; + msg << "Expected equality of these values:"; + msg << "\n " << lhs_expression; + if (lhs_value != lhs_expression) { + msg << "\n Which is: " << lhs_value; + } + msg << "\n " << rhs_expression; + if (rhs_value != rhs_expression) { + msg << "\n Which is: " << rhs_value; + } + + if (ignoring_case) { + msg << "\nIgnoring case"; + } + + if (!lhs_value.empty() && !rhs_value.empty()) { + const std::vector lhs_lines = SplitEscapedString(lhs_value); + const std::vector rhs_lines = SplitEscapedString(rhs_value); + if (lhs_lines.size() > 1 || rhs_lines.size() > 1) { + msg << "\nWith diff:\n" + << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines); + } + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +std::string GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, const char* expression_text, + const char* actual_predicate_value, const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. +AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2, + const char* abs_error_expr, double val1, + double val2, double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // Find the value which is closest to zero. + const double min_abs = std::min(fabs(val1), fabs(val2)); + // Find the distance to the next double from that value. + const double epsilon = + nextafter(min_abs, std::numeric_limits::infinity()) - min_abs; + // Detect the case where abs_error is so small that EXPECT_NEAR is + // effectively the same as EXPECT_EQUAL, and give an informative error + // message so that the situation can be more easily understood without + // requiring exotic floating-point knowledge. + // Don't do an epsilon check if abs_error is zero because that implies + // that an equality check was actually intended. + if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 && + abs_error < epsilon) { + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter " + << abs_error_expr << " evaluates to " << abs_error + << " which is smaller than the minimum distance between doubles for " + "numbers of this magnitude which is " + << epsilon + << ", thus making this EXPECT_NEAR check equivalent to " + "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead."; + } + return AssertionFailure() + << "The difference between " << expr1 << " and " << expr2 << " is " + << diff << ", which exceeds " << abs_error_expr << ", where\n" + << expr1 << " evaluates to " << val1 << ",\n" + << expr2 << " evaluates to " << val2 << ", and\n" + << abs_error_expr << " evaluates to " << abs_error << "."; +} + +// Helper template for implementing FloatLE() and DoubleLE(). +template +AssertionResult FloatingPointLE(const char* expr1, const char* expr2, + RawType val1, RawType val2) { + // Returns success if val1 is less than val2, + if (val1 < val2) { + return AssertionSuccess(); + } + + // or if val1 is almost equal to val2. + const FloatingPoint lhs(val1), rhs(val2); + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + // Note that the above two checks will both fail if either val1 or + // val2 is NaN, as the IEEE floating-point standard requires that + // any predicate involving a NaN must return false. + + ::std::stringstream val1_ss; + val1_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val1; + + ::std::stringstream val2_ss; + val2_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << val2; + + return AssertionFailure() + << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n" + << " Actual: " << StringStreamToString(&val1_ss) << " vs " + << StringStreamToString(&val2_ss); +} + +} // namespace internal + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult FloatLE(const char* expr1, const char* expr2, float val1, + float val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1, + double val2) { + return internal::FloatingPointLE(expr1, expr2, val1, val2); +} + +namespace internal { + +// The helper function for {ASSERT|EXPECT}_STREQ. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, const char* lhs, + const char* rhs) { + if (String::CStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression, + const char* rhs_expression, const char* lhs, + const char* rhs) { + if (String::CaseInsensitiveCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: \"" << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true if and only if needle +// is a substring of haystack. NULL is considered a substring of +// itself only. + +bool IsSubstringPred(const char* needle, const char* haystack) { + if (needle == nullptr || haystack == nullptr) return needle == haystack; + + return strstr(haystack, needle) != nullptr; +} + +bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { + if (needle == nullptr || haystack == nullptr) return needle == haystack; + + return wcsstr(haystack, needle) != nullptr; +} + +// StringType here can be either ::std::string or ::std::wstring. +template +bool IsSubstringPred(const StringType& needle, const StringType& haystack) { + return haystack.find(needle) != StringType::npos; +} + +// This function implements either IsSubstring() or IsNotSubstring(), +// depending on the value of the expected_to_be_substring parameter. +// StringType here can be const char*, const wchar_t*, ::std::string, +// or ::std::wstring. +template +AssertionResult IsSubstringImpl(bool expected_to_be_substring, + const char* needle_expr, + const char* haystack_expr, + const StringType& needle, + const StringType& haystack) { + if (IsSubstringPred(needle, haystack) == expected_to_be_substring) + return AssertionSuccess(); + + const bool is_wide_string = sizeof(needle[0]) > 1; + const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; + return AssertionFailure() + << "Value of: " << needle_expr << "\n" + << " Actual: " << begin_string_quote << needle << "\"\n" + << "Expected: " << (expected_to_be_substring ? "" : "not ") + << "a substring of " << haystack_expr << "\n" + << "Which is: " << begin_string_quote << haystack << "\""; +} + +} // namespace + +// IsSubstring() and IsNotSubstring() check whether needle is a +// substring of haystack (NULL is considered a substring of itself +// only), and return an appropriate error message when they fail. + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const char* needle, + const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, const wchar_t* needle, + const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::string& needle, + const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring(const char* needle_expr, + const char* haystack_expr, + const ::std::wstring& needle, + const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates +AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, + long hr) { // NOLINT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +#else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; + // Gets the system's human readable message string for this HRESULT. + char error_text[kBufSize] = {'\0'}; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + static_cast(hr), // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + nullptr); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing CR-LF) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +#endif // GTEST_OS_WINDOWS_MOBILE + + const std::string error_hex("0x" + String::FormatHexInt(hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << " " << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. + +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 +// like this: +// +// Code-point length Encoding +// 0 - 7 bits 0xxxxxxx +// 8 - 11 bits 110xxxxx 10xxxxxx +// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx +// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +// The maximum code-point a one-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint1 = (static_cast(1) << 7) - 1; + +// The maximum code-point a two-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; + +// The maximum code-point a three-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint3 = + (static_cast(1) << (4 + 2 * 6)) - 1; + +// The maximum code-point a four-byte UTF-8 sequence can represent. +constexpr uint32_t kMaxCodePoint4 = + (static_cast(1) << (3 + 3 * 6)) - 1; + +// Chops off the n lowest bits from a bit pattern. Returns the n +// lowest bits. As a side effect, the original bit pattern will be +// shifted to the right by n bits. +inline uint32_t ChopLowBits(uint32_t* bits, int n) { + const uint32_t low_bits = *bits & ((static_cast(1) << n) - 1); + *bits >>= n; + return low_bits; +} + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type uint32_t because wchar_t may not be +// wide enough to contain a code point. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted +// to "(Invalid Unicode 0xXXXXXXXX)". +std::string CodePointToUtf8(uint32_t code_point) { + if (code_point > kMaxCodePoint4) { + return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")"; + } + + char str[5]; // Big enough for the largest valid code point. + if (code_point <= kMaxCodePoint1) { + str[1] = '\0'; + str[0] = static_cast(code_point); // 0xxxxxxx + } else if (code_point <= kMaxCodePoint2) { + str[2] = '\0'; + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xC0 | code_point); // 110xxxxx + } else if (code_point <= kMaxCodePoint3) { + str[3] = '\0'; + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xE0 | code_point); // 1110xxxx + } else { // code_point <= kMaxCodePoint4 + str[4] = '\0'; + str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx + str[0] = static_cast(0xF0 | code_point); // 11110xxx + } + return str; +} + +// The following two functions only make sense if the system +// uses UTF-16 for wide string encoding. All supported systems +// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16. + +// Determines if the arguments constitute UTF-16 surrogate pair +// and thus should be combined into a single Unicode code point +// using CreateCodePointFromUtf16SurrogatePair. +inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { + return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 && + (second & 0xFC00) == 0xDC00; +} + +// Creates a Unicode code point from UTF16 surrogate pair. +inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first, + wchar_t second) { + const auto first_u = static_cast(first); + const auto second_u = static_cast(second); + const uint32_t mask = (1 << 10) - 1; + return (sizeof(wchar_t) == 2) + ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000 + : + // This function should not be called when the condition is + // false, but we provide a sensible default in case it is. + first_u; +} + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. +std::string WideStringToUtf8(const wchar_t* str, int num_chars) { + if (num_chars == -1) num_chars = static_cast(wcslen(str)); + + ::std::stringstream stream; + for (int i = 0; i < num_chars; ++i) { + uint32_t unicode_code_point; + + if (str[i] == L'\0') { + break; + } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { + unicode_code_point = + CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]); + i++; + } else { + unicode_code_point = static_cast(str[i]); + } + + stream << CodePointToUtf8(unicode_code_point); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to an std::string using the UTF-8 encoding. +// NULL will be converted to "(null)". +std::string String::ShowWideCString(const wchar_t* wide_c_str) { + if (wide_c_str == nullptr) return "(null)"; + + return internal::WideStringToUtf8(wide_c_str, -1); +} + +// Compares two wide C strings. Returns true if and only if they have the +// same content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* lhs_expression, + const char* rhs_expression, const wchar_t* lhs, + const wchar_t* rhs) { + if (String::WideCStringEquals(lhs, rhs)) { + return AssertionSuccess(); + } + + return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs), + PrintToString(rhs), false); +} + +// Helper function for *_STRNE on wide strings. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" << s2_expression + << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2); +} + +// Compares two C strings, ignoring case. Returns true if and only if they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) { + if (lhs == nullptr) return rhs == nullptr; + if (rhs == nullptr) return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + +// Compares two wide C strings, ignoring case. Returns true if and only if they +// have the same content. +// +// Unlike wcscasecmp(), this function can handle NULL argument(s). +// A NULL C string is considered different to any non-NULL wide C string, +// including the empty string. +// NB: The implementations on different platforms slightly differ. +// On windows, this method uses _wcsicmp which compares according to LC_CTYPE +// environment variable. On GNU platform this method uses wcscasecmp +// which compares according to LC_CTYPE category of the current locale. +// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the +// current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. + wint_t left, right; + do { + left = towlower(static_cast(*lhs++)); + right = towlower(static_cast(*rhs++)); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Returns true if and only if str ends with the given suffix, ignoring case. +// Any string is considered to end with an empty suffix. +bool String::EndsWithCaseInsensitive(const std::string& str, + const std::string& suffix) { + const size_t str_len = str.length(); + const size_t suffix_len = suffix.length(); + return (str_len >= suffix_len) && + CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len, + suffix.c_str()); +} + +// Formats an int value as "%02d". +std::string String::FormatIntWidth2(int value) { + return FormatIntWidthN(value, 2); +} + +// Formats an int value to given width with leading zeros. +std::string String::FormatIntWidthN(int value, int width) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(width) << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexUInt32(uint32_t value) { + std::stringstream ss; + ss << std::hex << std::uppercase << value; + return ss.str(); +} + +// Formats an int value as "%X". +std::string String::FormatHexInt(int value) { + return FormatHexUInt32(static_cast(value)); +} + +// Formats a byte as "%02X". +std::string String::FormatByte(unsigned char value) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase + << static_cast(value); + return ss.str(); +} + +// Converts the buffer in a stringstream to an std::string, converting NUL +// bytes to "\\0" along the way. +std::string StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + std::string result; + result.reserve(static_cast(2 * (end - start))); + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + result += "\\0"; // Replaces NUL with "\\0"; + } else { + result += *ch; + } + } + + return result; +} + +// Appends the user-supplied message to the Google-Test-generated message. +std::string AppendUserMessage(const std::string& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const std::string user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + if (gtest_msg.empty()) { + return user_msg_string; + } + return gtest_msg + "\n" + user_msg_string; +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {} + +// D'tor. +TestResult::~TestResult() {} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) internal::posix::Abort(); + return test_part_results_.at(static_cast(i)); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. +const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) internal::posix::Abort(); + return test_properties_.at(static_cast(i)); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { test_part_results_.clear(); } + +// Adds a test part result to the list. +void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. +void TestResult::RecordProperty(const std::string& xml_element, + const TestProperty& test_property) { + if (!ValidateTestProperty(xml_element, test_property)) { + return; + } + internal::MutexLock lock(&test_properties_mutex_); + const std::vector::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuitesAttributes[] = { + "disabled", "errors", "failures", "name", + "random_seed", "tests", "time", "timestamp"}; + +// The list of reserved attributes used in the element of XML +// output. +static const char* const kReservedTestSuiteAttributes[] = { + "disabled", "errors", "failures", "name", + "tests", "time", "timestamp", "skipped"}; + +// The list of reserved attributes used in the element of XML output. +static const char* const kReservedTestCaseAttributes[] = { + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; + +// Use a slightly different set for allowed output to ensure existing tests can +// still RecordProperty("result") or "RecordProperty(timestamp") +static const char* const kReservedOutputTestCaseAttributes[] = { + "classname", "name", "status", "time", "type_param", + "value_param", "file", "line", "result", "timestamp"}; + +template +std::vector ArrayAsVector(const char* const (&array)[kSize]) { + return std::vector(array, array + kSize); +} + +static std::vector GetReservedAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +// TODO(jdesprez): Merge the two getReserved attributes once skip is improved +static std::vector GetReservedOutputAttributesForElement( + const std::string& xml_element) { + if (xml_element == "testsuites") { + return ArrayAsVector(kReservedTestSuitesAttributes); + } else if (xml_element == "testsuite") { + return ArrayAsVector(kReservedTestSuiteAttributes); + } else if (xml_element == "testcase") { + return ArrayAsVector(kReservedOutputTestCaseAttributes); + } else { + GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element; + } + // This code is unreachable but some compilers may not realizes that. + return std::vector(); +} + +static std::string FormatWordList(const std::vector& words) { + Message word_list; + for (size_t i = 0; i < words.size(); ++i) { + if (i > 0 && words.size() > 2) { + word_list << ", "; + } + if (i == words.size() - 1) { + word_list << "and "; + } + word_list << "'" << words[i] << "'"; + } + return word_list.GetString(); +} + +static bool ValidateTestPropertyName( + const std::string& property_name, + const std::vector& reserved_names) { + if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != + reserved_names.end()) { + ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name + << " (" << FormatWordList(reserved_names) + << " are reserved by " << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Adds a failure if the key is a reserved attribute of the element named +// xml_element. Returns true if the property is valid. +bool TestResult::ValidateTestProperty(const std::string& xml_element, + const TestProperty& test_property) { + return ValidateTestPropertyName(test_property.key(), + GetReservedAttributesForElement(xml_element)); +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true off the test part was skipped. +static bool TestPartSkipped(const TestPartResult& result) { + return result.skipped(); +} + +// Returns true if and only if the test was skipped. +bool TestResult::Skipped() const { + return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0; +} + +// Returns true if and only if the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) return true; + } + return false; +} + +// Returns true if and only if the test part fatally failed. +static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true if and only if the test fatally failed. +bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true if and only if the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true if and only if the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the states of all flags. +Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {} + +// The d'tor restores the states of all flags. The actual work is +// done by the d'tor of the gtest_flag_saver_ field, and thus not +// visible here. +Test::~Test() {} + +// Sets up the test fixture. +// +// A sub-class may override this. +void Test::SetUp() {} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() {} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, const std::string& value) { + UnitTest::GetInstance()->RecordProperty(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const std::string& key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. + UnitTest::GetInstance()->AddTestPartResult( + result_type, + nullptr, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + ""); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test suite to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test suite. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestSuite* const test_suite = impl->current_test_suite(); + + // Info about the first test in the current test suite. + const TestInfo* const first_test_info = test_suite->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. + const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // Both TEST and TEST_F appear in same test suite, which is incorrect. + // Tell the user how to fix this. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test suite must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test suite is\n" + << "illegal. In test suite " << this_test_info->test_suite_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // Two fixture classes with the same name appear in two different + // namespaces, which is not allowed. Tell the user how to fix this. + ADD_FAILURE() + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " + << this_test_info->test_suite_name() << ",\n" + << "you defined test " << first_test_name << " and test " + << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. You should probably rename one\n" + << "of the classes to put the tests into different test suites."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). +static std::string* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << exception_code + << std::setbase(10) << " thrown in " << location << "."; + + return new std::string(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +namespace internal { + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. +static std::string FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != nullptr) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result); + +GoogleTestFailureException::GoogleTestFailureException( + const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} + +#endif // GTEST_HAS_EXCEPTIONS + +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) +template +Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + std::string* exception_message = + FormatSehExceptionMessage(GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template +Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(), + const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG_GET(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. + if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const AssertionException&) { // NOLINT + // This failure was reported already. + } catch (const internal::GoogleTestFailureException&) { // NOLINT + // This exception type can only be thrown by a failed Google + // Test assertion with the intention of letting another testing + // framework catch it. Therefore we just re-throw it. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(nullptr, location)); + } + return static_cast(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful and didn't call + // GTEST_SKIP(). + if (!HasFatalFailure() && !IsSkipped()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody, + "the test body"); + } + + // However, we want to clean up as much as possible. Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown, + "TearDown()"); +} + +// Returns true if and only if the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true if and only if the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl() + ->current_test_result() + ->HasNonfatalFailure(); +} + +// Returns true if and only if the current test was skipped. +bool Test::IsSkipped() { + return internal::GetUnitTestImpl()->current_test_result()->Skipped(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +TestInfo::TestInfo(const std::string& a_test_suite_name, + const std::string& a_name, const char* a_type_param, + const char* a_value_param, + internal::CodeLocation a_code_location, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_suite_name_(a_test_suite_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), + value_param_(a_value_param ? new std::string(a_value_param) : nullptr), + location_(a_code_location), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + is_in_another_shard_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_suite_name, const char* name, const char* type_param, + const char* value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_suite_name, name, type_param, value_param, + code_location, fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +void ReportInvalidTestSuiteType(const char* test_suite_name, + CodeLocation code_location) { + Message errors; + errors + << "Attempted redefinition of test suite " << test_suite_name << ".\n" + << "All tests in the same test suite must use the same test fixture\n" + << "class. However, in test suite " << test_suite_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test suites."; + + GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(), + code_location.line) + << " " << errors.GetString(); +} +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestSuite class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. +class TestNameIs { + public: + // Constructor. + // + // TestNameIs has NO default constructor. + explicit TestNameIs(const char* name) : name_(name) {} + + // Returns true if and only if the test name of test_info matches name_. + bool operator()(const TestInfo* test_info) const { + return test_info && test_info->name() == name_; + } + + private: + std::string name_; +}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + type_parameterized_test_registry_.CheckForInstantiations(); + parameterized_tests_registered_ = true; + } +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + if (!should_run_) { + if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this); + return; + } + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + result_.set_start_timestamp(internal::GetTimeInMillis()); + internal::Timer timer; + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. + Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test if the constructor didn't generate a fatal failure or invoke + // GTEST_SKIP(). + // Note that the object will not be null + if (!Test::HasFatalFailure() && !Test::IsSkipped()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + if (test != nullptr) { + // Deletes the test object. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + } + + result_.set_elapsed_time(timer.Elapsed()); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(nullptr); +} + +// Skip and records a skipped test result for this object. +void TestInfo::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + + const TestPartResult test_part_result = + TestPartResult(TestPartResult::kSkip, this->file(), this->line(), ""); + impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + test_part_result); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + impl->set_current_test_info(nullptr); +} + +// class TestSuite + +// Gets the number of successful tests in this test suite. +int TestSuite::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of successful tests in this test suite. +int TestSuite::skipped_test_count() const { + return CountIf(test_info_list_, TestSkipped); +} + +// Gets the number of failed tests in this test suite. +int TestSuite::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int TestSuite::reportable_disabled_test_count() const { + return CountIf(test_info_list_, TestReportableDisabled); +} + +// Gets the number of disabled tests in this test suite. +int TestSuite::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Gets the number of tests to be printed in the XML report. +int TestSuite::reportable_test_count() const { + return CountIf(test_info_list_, TestReportable); +} + +// Get the number of tests in this test suite that should run. +int TestSuite::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. +int TestSuite::total_test_count() const { + return static_cast(test_info_list_.size()); +} + +// Creates a TestSuite with the given name. +// +// Arguments: +// +// a_name: name of the test suite +// a_type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite::TestSuite(const char* a_name, const char* a_type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : nullptr), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + start_timestamp_(0), + elapsed_time_(0) {} + +// Destructor of TestSuite. +TestSuite::~TestSuite() { + // Deletes every Test in the collection. + ForEach(test_info_list_, internal::Delete); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestSuite::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +TestInfo* TestSuite::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? nullptr : test_info_list_[static_cast(index)]; +} + +// Adds a test to this test suite. Will delete the test upon +// destruction of the TestSuite object. +void TestSuite::AddTestInfo(TestInfo* test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast(test_indices_.size())); +} + +// Runs every test in this TestSuite. +void TestSuite::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()"); + + const bool skip_all = ad_hoc_test_result().Failed(); + + start_timestamp_ = internal::GetTimeInMillis(); + internal::Timer timer; + for (int i = 0; i < total_test_count(); i++) { + if (skip_all) { + GetMutableTestInfo(i)->Skip(); + } else { + GetMutableTestInfo(i)->Run(); + } + if (GTEST_FLAG_GET(fail_fast) && + GetMutableTestInfo(i)->result()->Failed()) { + for (int j = i + 1; j < total_test_count(); j++) { + GetMutableTestInfo(j)->Skip(); + } + break; + } + } + elapsed_time_ = timer.Elapsed(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()"); + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Skips all tests under this TestSuite. +void TestSuite::Skip() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_suite(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Call both legacy and the new API + repeater->OnTestSuiteStart(*this); +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseStart(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Skip(); + } + + // Call both legacy and the new API + repeater->OnTestSuiteEnd(*this); + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + repeater->OnTestCaseEnd(*this); +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + impl->set_current_test_suite(nullptr); +} + +// Clears the results of all tests in this test suite. +void TestSuite::ClearResult() { + ad_hoc_test_result_.Clear(); + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test suite. +void TestSuite::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestSuite::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static std::string FormatCountableNoun(int count, const char* singular_form, + const char* plural_form) { + return internal::StreamableToString(count) + " " + + (count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. +static std::string FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test suites. +static std::string FormatTestSuiteCount(int test_suite_count) { + return FormatCountableNoun(test_suite_count, "test suite", "test suites"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char* TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSkip: + return "Skipped\n"; + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +namespace internal { +namespace { +enum class GTestColor { kDefault, kRed, kGreen, kYellow }; +} // namespace + +// Prints a TestPartResult to an std::string. +static std::string PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() << internal::FormatFileLocation( + test_part_result.file_name(), + test_part_result.line_number()) + << " " + << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()) + .GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const std::string& result = PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW + +// Returns the character attribute for the given color. +static WORD GetColorAttribute(GTestColor color) { + switch (color) { + case GTestColor::kRed: + return FOREGROUND_RED; + case GTestColor::kGreen: + return FOREGROUND_GREEN; + case GTestColor::kYellow: + return FOREGROUND_RED | FOREGROUND_GREEN; + default: + return 0; + } +} + +static int GetBitOffset(WORD color_mask) { + if (color_mask == 0) return 0; + + int bitOffset = 0; + while ((color_mask & 1) == 0) { + color_mask >>= 1; + ++bitOffset; + } + return bitOffset; +} + +static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { + // Let's reuse the BG + static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN | + BACKGROUND_RED | BACKGROUND_INTENSITY; + static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN | + FOREGROUND_RED | FOREGROUND_INTENSITY; + const WORD existing_bg = old_color_attrs & background_mask; + + WORD new_color = + GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY; + static const int bg_bitOffset = GetBitOffset(background_mask); + static const int fg_bitOffset = GetBitOffset(foreground_mask); + + if (((new_color & background_mask) >> bg_bitOffset) == + ((new_color & foreground_mask) >> fg_bitOffset)) { + new_color ^= FOREGROUND_INTENSITY; // invert intensity + } + return new_color; +} + +#else + +// Returns the ANSI color code for the given color. GTestColor::kDefault is +// an invalid input. +static const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case GTestColor::kRed: + return "1"; + case GTestColor::kGreen: + return "2"; + case GTestColor::kYellow: + return "3"; + default: + return nullptr; + } +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true if and only if Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + std::string c = GTEST_FLAG_GET(color); + const char* const gtest_color = c.c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. + const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "screen-256color") || + String::CStringEquals(term, "tmux") || + String::CStringEquals(term, "tmux-256color") || + String::CStringEquals(term, "rxvt-unicode") || + String::CStringEquals(term, "rxvt-unicode-256color") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. + +GTEST_ATTRIBUTE_PRINTF_(2, 3) +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != GTestColor::kDefault); + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \ + !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + const WORD new_color = GetNewColor(color, old_color_attrs); + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, new_color); + + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +// Text printed in Google Test's text output and --gtest_list_tests +// output to label the type parameter and value parameter for a test. +static const char kTypeParamLabel[] = "TypeParam"; +static const char kValueParamLabel[] = "GetParam()"; + +static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != nullptr || value_param != nullptr) { + printf(", where "); + if (type_param != nullptr) { + printf("%s = %s", kTypeParamLabel, type_param); + if (value_param != nullptr) printf(" and "); + } + if (value_param != nullptr) { + printf("%s = %s", kValueParamLabel, value_param); + } + } +} + +// This class implements the TestEventListener interface. +// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& test_case) override; +#else + void OnTestSuiteStart(const TestSuite& test_suite) override; +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& test_case) override; +#else + void OnTestSuiteEnd(const TestSuite& test_suite) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); + static void PrintFailedTestSuites(const UnitTest& unit_test); + static void PrintSkippedTests(const UnitTest& unit_test); +}; + +// Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG_GET(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + std::string f = GTEST_FLAG_GET(filter); + const char* const filter = f.c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. + if (!String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_, + filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n", + static_cast(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG_GET(shuffle)) { + ColoredPrintf(GTestColor::kYellow, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s", counts.c_str(), test_case.name()); + if (test_case.type_param() == nullptr) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param()); + } + fflush(stdout); +} +#else +void PrettyUnitTestResultPrinter::OnTestSuiteStart( + const TestSuite& test_suite) { + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s", counts.c_str(), test_suite.name()); + if (test_suite.type_param() == nullptr) { + printf("\n"); + } else { + printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param()); + } + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kGreen, "[ RUN ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) { + ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(GTestColor::kGreen, "[ OK ] "); + } else if (test_info.result()->Skipped()) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + } else { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + } + PrintTestName(test_info.test_suite_name(), test_info.name()); + if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG_GET(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} +#else +void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) { + if (!GTEST_FLAG_GET(print_time)) return; + + const std::string counts = + FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests"); + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(), + internal::StreamableToString(test_suite.elapsed_time()).c_str()); + fflush(stdout); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(GTestColor::kGreen, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. +void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Failed()) { + continue; + } + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s.%s", test_suite.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } + printf("\n%2d FAILED %s\n", failed_test_count, + failed_test_count == 1 ? "TEST" : "TESTS"); +} + +// Internal helper for printing the list of test suite failures not covered by +// PrintFailedTests. +void PrettyUnitTestResultPrinter::PrintFailedTestSuites( + const UnitTest& unit_test) { + int suite_failure_count = 0; + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run()) { + continue; + } + if (test_suite.ad_hoc_test_result().Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name()); + ++suite_failure_count; + } + } + if (suite_failure_count > 0) { + printf("\n%2d FAILED TEST %s\n", suite_failure_count, + suite_failure_count == 1 ? "SUITE" : "SUITES"); + } +} + +// Internal helper for printing the list of skipped tests. +void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) { + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + const TestSuite& test_suite = *unit_test.GetTestSuite(i); + if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_suite.total_test_count(); ++j) { + const TestInfo& test_info = *test_suite.GetTestInfo(j); + if (!test_info.should_run() || !test_info.result()->Skipped()) { + continue; + } + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.%s", test_suite.name(), test_info.name()); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str()); + PrintSkippedTests(unit_test); + } + + if (!unit_test.Passed()) { + PrintFailedTests(unit_test); + PrintFailedTestSuites(unit_test); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End PrettyUnitTestResultPrinter + +// This class implements the TestEventListener interface. +// +// Class BriefUnitTestResultPrinter is copyable. +class BriefUnitTestResultPrinter : public TestEventListener { + public: + BriefUnitTestResultPrinter() {} + static void PrintTestName(const char* test_suite, const char* test) { + printf("%s.%s", test_suite, test); + } + + // The following methods override what's in the TestEventListener class. + void OnTestProgramStart(const UnitTest& /*unit_test*/) override {} + void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) override {} + void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {} +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {} +#endif // OnTestCaseStart + + void OnTestStart(const TestInfo& /*test_info*/) override {} + void OnTestDisabled(const TestInfo& /*test_info*/) override {} + + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& /*test_case*/) override {} +#else + void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {} + void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {} + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {} +}; + +// Called after an assertion failure. +void BriefUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + switch (result.type()) { + // If the test part succeeded, we don't need to do anything. + case TestPartResult::kSuccess: + return; + default: + // Print failure message from the assertion + // (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Failed()) { + ColoredPrintf(GTestColor::kRed, "[ FAILED ] "); + PrintTestName(test_info.test_suite_name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms)\n", + internal::StreamableToString(test_info.result()->elapsed_time()) + .c_str()); + } else { + printf("\n"); + } + fflush(stdout); + } +} + +void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(GTestColor::kGreen, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str()); + if (GTEST_FLAG_GET(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(GTestColor::kGreen, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + const int skipped_test_count = unit_test.skipped_test_count(); + if (skipped_test_count > 0) { + ColoredPrintf(GTestColor::kGreen, "[ SKIPPED ] "); + printf("%s.\n", FormatTestCount(skipped_test_count).c_str()); + } + + int num_disabled = unit_test.reportable_disabled_test_count(); + if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) { + if (unit_test.Passed()) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. + } + ColoredPrintf(GTestColor::kYellow, " YOU HAVE %d DISABLED %s\n\n", + num_disabled, num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End BriefUnitTestResultPrinter + +// class TestEventRepeater +// +// This class forwards events to other event listeners. +class TestEventRepeater : public TestEventListener { + public: + TestEventRepeater() : forwarding_enabled_(true) {} + ~TestEventRepeater() override; + void Append(TestEventListener* listener); + TestEventListener* Release(TestEventListener* listener); + + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled() const { return forwarding_enabled_; } + void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } + + void OnTestProgramStart(const UnitTest& unit_test) override; + void OnTestIterationStart(const UnitTest& unit_test, int iteration) override; + void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override; + void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseStart(const TestSuite& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteStart(const TestSuite& parameter) override; + void OnTestStart(const TestInfo& test_info) override; + void OnTestDisabled(const TestInfo& test_info) override; + void OnTestPartResult(const TestPartResult& result) override; + void OnTestEnd(const TestInfo& test_info) override; +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestCaseEnd(const TestCase& parameter) override; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + void OnTestSuiteEnd(const TestSuite& parameter) override; + void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override; + void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override; + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void OnTestProgramEnd(const UnitTest& unit_test) override; + + private: + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled_; + // The list of listeners that receive events. + std::vector listeners_; + + TestEventRepeater(const TestEventRepeater&) = delete; + TestEventRepeater& operator=(const TestEventRepeater&) = delete; +}; + +TestEventRepeater::~TestEventRepeater() { + ForEach(listeners_, Delete); +} + +void TestEventRepeater::Append(TestEventListener* listener) { + listeners_.push_back(listener); +} + +TestEventListener* TestEventRepeater::Release(TestEventListener* listener) { + for (size_t i = 0; i < listeners_.size(); ++i) { + if (listeners_[i] == listener) { + listeners_.erase(listeners_.begin() + static_cast(i)); + return listener; + } + } + + return nullptr; +} + +// Since most methods are very similar, use macros to reduce boilerplate. +// This defines a member that forwards the call to all listeners. +#define GTEST_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ + } +// This defines a member that forwards the call to all listeners in reverse +// order. +#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ + void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = listeners_.size(); i != 0; i--) { \ + listeners_[i - 1]->Name(parameter); \ + } \ + } \ + } + +GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) +GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite) +GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo) +GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) +GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite) +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite) +GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) + +#undef GTEST_REPEATER_METHOD_ +#undef GTEST_REVERSE_REPEATER_METHOD_ + +void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = 0; i < listeners_.size(); i++) { + listeners_[i]->OnTestIterationStart(unit_test, iteration); + } + } +} + +void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = listeners_.size(); i > 0; i--) { + listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration); + } + } +} + +// End TestEventRepeater + +// This class generates an XML output file. +class XmlUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit XmlUnitTestResultPrinter(const char* output_file); + + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + void ListTestsMatchingFilter(const std::vector& test_suites); + + // Prints an XML summary of all unit tests. + static void PrintXmlTestsList(std::ostream* stream, + const std::vector& test_suites); + + private: + // Is c a whitespace character that is normalized to a space character + // when it appears in an XML attribute value? + static bool IsNormalizableWhitespace(unsigned char c) { + return c == '\t' || c == '\n' || c == '\r'; + } + + // May c appear in a well-formed XML document? + // https://www.w3.org/TR/REC-xml/#charsets + static bool IsValidXmlCharacter(unsigned char c) { + return IsNormalizableWhitespace(c) || c >= 0x20; + } + + // Returns an XML-escaped copy of the input string str. If + // is_attribute is true, the text is meant to appear as an attribute + // value, and normalizable whitespace is preserved by replacing it + // with character references. + static std::string EscapeXml(const std::string& str, bool is_attribute); + + // Returns the given string with all characters invalid in XML removed. + static std::string RemoveInvalidXmlCharacters(const std::string& str); + + // Convenience wrapper around EscapeXml when str is an attribute value. + static std::string EscapeXmlAttribute(const std::string& str) { + return EscapeXml(str, true); + } + + // Convenience wrapper around EscapeXml when str is not an attribute value. + static std::string EscapeXmlText(const char* str) { + return EscapeXml(str, false); + } + + // Verifies that the given attribute belongs to the given element and + // streams the attribute as XML. + static void OutputXmlAttribute(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value); + + // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. + static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + + // Streams a test suite XML stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputXmlTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestResult object. + static void OutputXmlTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams an XML representation of a TestInfo object. + static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestSuite object + static void PrintXmlTestSuite(::std::ostream* stream, + const TestSuite& test_suite); + + // Prints an XML summary of unit_test to output stream out. + static void PrintXmlUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the std::string is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + + // Streams an XML representation of the test properties of a TestResult + // object. + static void OutputXmlTestProperties(std::ostream* stream, + const TestResult& result); + + // The output file. + const std::string output_file_; + + XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete; + XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete; +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "XML output file may not be null"; + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlUnitTest(&stream, unit_test); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +void XmlUnitTestResultPrinter::ListTestsMatchingFilter( + const std::vector& test_suites) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlTestsList(&stream, test_suites); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. +std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str, + bool is_attribute) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '<': + m << "<"; + break; + case '>': + m << ">"; + break; + case '&': + m << "&"; + break; + case '\'': + if (is_attribute) + m << "'"; + else + m << '\''; + break; + case '"': + if (is_attribute) + m << """; + else + m << '"'; + break; + default: + if (IsValidXmlCharacter(static_cast(ch))) { + if (is_attribute && + IsNormalizableWhitespace(static_cast(ch))) + m << "&#x" << String::FormatByte(static_cast(ch)) + << ";"; + else + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// Returns the given string with all characters invalid in XML removed. +// Currently invalid characters are dropped from the string. An +// alternative is to replace them with certain characters such as . or ?. +std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( + const std::string& str) { + std::string output; + output.reserve(str.size()); + for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) + if (IsValidXmlCharacter(static_cast(*it))) + output.push_back(*it); + + return output; +} + +// The following routines generate an XML representation of a UnitTest +// object. +// +// This is how Google Test concepts map to the DTD: +// +// <-- corresponds to a UnitTest object +// <-- corresponds to a TestSuite object +// <-- corresponds to a TestInfo object +// ... +// ... +// ... +// <-- individual assertion failures +// +// +// + +// Formats the given time in milliseconds as seconds. +std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast(ms) * 1e-3); + return ss.str(); +} + +static bool PortableLocaltime(time_t seconds, struct tm* out) { +#if defined(_MSC_VER) + return localtime_s(out, &seconds) == 0; +#elif defined(__MINGW32__) || defined(__MINGW64__) + // MINGW provides neither localtime_r nor localtime_s, but uses + // Windows' localtime(), which has a thread-local tm buffer. + struct tm* tm_ptr = localtime(&seconds); // NOLINT + if (tm_ptr == nullptr) return false; + *out = *tm_ptr; + return true; +#elif defined(__STDC_LIB_EXT1__) + // Uses localtime_s when available as localtime_r is only available from + // C23 standard. + return localtime_s(&seconds, out) != nullptr; +#else + return localtime_r(&seconds, out) != nullptr; +#endif +} + +// Converts the given epoch time in milliseconds to a date string in the ISO +// 8601 format, without the timezone information. +std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss.sss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "." + + String::FormatIntWidthN(static_cast(ms % 1000), 3); +} + +// Streams an XML CDATA section, escaping invalid CDATA sequences as needed. +void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, + const char* data) { + const char* segment = data; + *stream << ""); + if (next_segment != nullptr) { + stream->write(segment, + static_cast(next_segment - segment)); + *stream << "]]>]]>"); + } else { + *stream << segment; + break; + } + } + *stream << "]]>"; +} + +void XmlUnitTestResultPrinter::OutputXmlAttribute( + std::ostream* stream, const std::string& element_name, + const std::string& name, const std::string& value) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Attribute " << name << " is not allowed for element <" << element_name + << ">."; + + *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\""; +} + +// Streams a test suite XML stanza containing the given test result. +void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a minimal test suite with one test. + *stream << " "; + + // Output the boilerplate for a minimal test case with a single test. + *stream << " \n"; +} + +// Prints an XML representation of a TestInfo object. +void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestsuite = "testcase"; + + if (test_info.is_in_another_shard()) { + return; + } + + *stream << " \n"; + return; + } + + OutputXmlAttribute(stream, kTestsuite, "status", + test_info.should_run() ? "run" : "notrun"); + OutputXmlAttribute(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "skipped" : "completed") + : "suppressed"); + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(result.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(result.start_timestamp())); + OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name); + + OutputXmlTestResult(stream, result); +} + +void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream, + const TestResult& result) { + int failures = 0; + int skips = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + if (++failures == 1 && skips == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; + } else if (part.skipped()) { + if (++skips == 1 && failures == 0) { + *stream << ">\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); + *stream << " "; + const std::string detail = location + "\n" + part.message(); + OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); + *stream << "\n"; + } + } + + if (failures == 0 && skips == 0 && result.test_property_count() == 0) { + *stream << " />\n"; + } else { + if (failures == 0 && skips == 0) { + *stream << ">\n"; + } + OutputXmlTestProperties(stream, result); + *stream << " \n"; + } +} + +// Prints an XML representation of a TestSuite object +void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream, + const TestSuite& test_suite) { + const std::string kTestsuite = "testsuite"; + *stream << " <" << kTestsuite; + OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name()); + OutputXmlAttribute(stream, kTestsuite, "tests", + StreamableToString(test_suite.reportable_test_count())); + if (!GTEST_FLAG_GET(list_tests)) { + OutputXmlAttribute(stream, kTestsuite, "failures", + StreamableToString(test_suite.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuite, "disabled", + StreamableToString(test_suite.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "skipped", + StreamableToString(test_suite.skipped_test_count())); + + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(test_suite.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp())); + *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result()); + } + *stream << ">\n"; + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) + OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); + } + *stream << " \n"; +} + +// Prints an XML summary of unit_test to output stream out. +void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(unit_test.reportable_test_count())); + OutputXmlAttribute(stream, kTestsuites, "failures", + StreamableToString(unit_test.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuites, "disabled", + StreamableToString(unit_test.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuites, "errors", "0"); + OutputXmlAttribute(stream, kTestsuites, "time", + FormatTimeInMillisAsSeconds(unit_test.elapsed_time())); + OutputXmlAttribute( + stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp())); + + if (GTEST_FLAG_GET(shuffle)) { + OutputXmlAttribute(stream, kTestsuites, "random_seed", + StreamableToString(unit_test.random_seed())); + } + *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); + + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) + PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i)); + } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + + *stream << "\n"; +} + +void XmlUnitTestResultPrinter::PrintXmlTestsList( + std::ostream* stream, const std::vector& test_suites) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + int total_tests = 0; + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); + } + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(total_tests)); + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (auto test_suite : test_suites) { + PrintXmlTestSuite(stream, *test_suite); + } + *stream << "\n"; +} + +// Produces a string representing the test properties in a result as space +// delimited XML attributes based on the property key="value" pairs. +std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( + const TestResult& result) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << " " << property.key() << "=" + << "\"" << EscapeXmlAttribute(property.value()) << "\""; + } + return attributes.GetString(); +} + +void XmlUnitTestResultPrinter::OutputXmlTestProperties( + std::ostream* stream, const TestResult& result) { + const std::string kProperties = "properties"; + const std::string kProperty = "property"; + + if (result.test_property_count() <= 0) { + return; + } + + *stream << " <" << kProperties << ">\n"; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + *stream << " <" << kProperty; + *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; + *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; + *stream << "/>\n"; + } + *stream << " \n"; +} + +// End XmlUnitTestResultPrinter + +// This class generates an JSON output file. +class JsonUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit JsonUnitTestResultPrinter(const char* output_file); + + void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override; + + // Prints an JSON summary of all unit tests. + static void PrintJsonTestList(::std::ostream* stream, + const std::vector& test_suites); + + private: + // Returns an JSON-escaped copy of the input string str. + static std::string EscapeJson(const std::string& str); + + //// Verifies that the given attribute belongs to the given element and + //// streams the attribute as JSON. + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, const std::string& value, + const std::string& indent, bool comma = true); + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, int value, + const std::string& indent, bool comma = true); + + // Streams a test suite JSON stanza containing the given test result. + // + // Requires: result.Failed() + static void OutputJsonTestSuiteForTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestResult object. + static void OutputJsonTestResult(::std::ostream* stream, + const TestResult& result); + + // Streams a JSON representation of a TestInfo object. + static void OutputJsonTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info); + + // Prints a JSON representation of a TestSuite object + static void PrintJsonTestSuite(::std::ostream* stream, + const TestSuite& test_suite); + + // Prints a JSON summary of unit_test to output stream out. + static void PrintJsonUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as + // a JSON dictionary. + static std::string TestPropertiesAsJson(const TestResult& result, + const std::string& indent); + + // The output file. + const std::string output_file_; + + JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete; + JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) = + delete; +}; + +// Creates a new JsonUnitTestResultPrinter. +JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "JSON output file may not be null"; + } +} + +void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* jsonout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintJsonUnitTest(&stream, unit_test); + fprintf(jsonout, "%s", StringStreamToString(&stream).c_str()); + fclose(jsonout); +} + +// Returns an JSON-escaped copy of the input string str. +std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '\\': + case '"': + case '/': + m << '\\' << ch; + break; + case '\b': + m << "\\b"; + break; + case '\t': + m << "\\t"; + break; + case '\n': + m << "\\n"; + break; + case '\f': + m << "\\f"; + break; + case '\r': + m << "\\r"; + break; + default: + if (ch < ' ') { + m << "\\u00" << String::FormatByte(static_cast(ch)); + } else { + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// The following routines generate an JSON representation of a UnitTest +// object. + +// Formats the given time in milliseconds as seconds. +static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast(ms) * 1e-3) << "s"; + return ss.str(); +} + +// Converts the given epoch time in milliseconds to a date string in the +// RFC3339 format, without the timezone information. +static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; +} + +static inline std::string Indent(size_t width) { + return std::string(width, ' '); +} + +void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; + if (comma) *stream << ",\n"; +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, const std::string& element_name, + const std::string& name, int value, const std::string& indent, bool comma) { + const std::vector& allowed_names = + GetReservedOutputAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": " << StreamableToString(value); + if (comma) *stream << ",\n"; +} + +// Streams a test suite JSON stanza containing the given test result. +void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult( + ::std::ostream* stream, const TestResult& result) { + // Output the boilerplate for a new test suite. + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6)); + OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6)); + if (!GTEST_FLAG_GET(list_tests)) { + OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6)); + OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6)); + OutputJsonKey(stream, "testsuite", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(6)); + OutputJsonKey(stream, "testsuite", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(6)); + } + *stream << Indent(6) << "\"testsuite\": [\n"; + + // Output the boilerplate for a new test case. + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, "testcase", "name", "", Indent(10)); + OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10)); + OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10)); + OutputJsonKey(stream, "testcase", "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + Indent(10)); + OutputJsonKey(stream, "testcase", "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), + Indent(10)); + OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false); + *stream << TestPropertiesAsJson(result, Indent(10)); + + // Output the actual test result. + OutputJsonTestResult(stream, result); + + // Finish the test suite. + *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON representation of a TestInfo object. +void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, + const char* test_suite_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestsuite = "testcase"; + const std::string kIndent = Indent(10); + + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent); + + if (test_info.value_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(), + kIndent); + } + if (test_info.type_param() != nullptr) { + OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(), + kIndent); + } + + OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false); + if (GTEST_FLAG_GET(list_tests)) { + *stream << "\n" << Indent(8) << "}"; + return; + } else { + *stream << ",\n"; + } + + OutputJsonKey(stream, kTestsuite, "status", + test_info.should_run() ? "RUN" : "NOTRUN", kIndent); + OutputJsonKey(stream, kTestsuite, "result", + test_info.should_run() + ? (result.Skipped() ? "SKIPPED" : "COMPLETED") + : "SUPPRESSED", + kIndent); + OutputJsonKey(stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent); + OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent, + false); + *stream << TestPropertiesAsJson(result, kIndent); + + OutputJsonTestResult(stream, result); +} + +void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream, + const TestResult& result) { + const std::string kIndent = Indent(10); + + int failures = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + *stream << ",\n"; + if (++failures == 1) { + *stream << kIndent << "\"" + << "failures" + << "\": [\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string message = EscapeJson(location + "\n" + part.message()); + *stream << kIndent << " {\n" + << kIndent << " \"failure\": \"" << message << "\",\n" + << kIndent << " \"type\": \"\"\n" + << kIndent << " }"; + } + } + + if (failures > 0) *stream << "\n" << kIndent << "]"; + *stream << "\n" << Indent(8) << "}"; +} + +// Prints an JSON representation of a TestSuite object +void JsonUnitTestResultPrinter::PrintJsonTestSuite( + std::ostream* stream, const TestSuite& test_suite) { + const std::string kTestsuite = "testsuite"; + const std::string kIndent = Indent(6); + + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(), + kIndent); + if (!GTEST_FLAG_GET(list_tests)) { + OutputJsonKey(stream, kTestsuite, "failures", + test_suite.failed_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "disabled", + test_suite.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent); + OutputJsonKey( + stream, kTestsuite, "timestamp", + FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(test_suite.elapsed_time()), + kIndent, false); + *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent) + << ",\n"; + } + + *stream << kIndent << "\"" << kTestsuite << "\": [\n"; + + bool comma = false; + for (int i = 0; i < test_suite.total_test_count(); ++i) { + if (test_suite.GetTestInfo(i)->is_reportable()) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i)); + } + } + *stream << "\n" << kIndent << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON summary of unit_test to output stream out. +void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream, + const UnitTest& unit_test) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + + OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuites, "disabled", + unit_test.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent); + if (GTEST_FLAG_GET(shuffle)) { + OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(), + kIndent); + } + OutputJsonKey(stream, kTestsuites, "timestamp", + FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()), + kIndent); + OutputJsonKey(stream, kTestsuites, "time", + FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent, + false); + + *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent) + << ",\n"; + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + bool comma = false; + for (int i = 0; i < unit_test.total_test_suite_count(); ++i) { + if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i)); + } + } + + // If there was a test failure outside of one of the test suites (like in a + // test environment) include that in the output. + if (unit_test.ad_hoc_test_result().Failed()) { + OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result()); + } + + *stream << "\n" + << kIndent << "]\n" + << "}\n"; +} + +void JsonUnitTestResultPrinter::PrintJsonTestList( + std::ostream* stream, const std::vector& test_suites) { + const std::string kTestsuites = "testsuites"; + const std::string kIndent = Indent(2); + *stream << "{\n"; + int total_tests = 0; + for (auto test_suite : test_suites) { + total_tests += test_suite->total_test_count(); + } + OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent); + + OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent); + *stream << kIndent << "\"" << kTestsuites << "\": [\n"; + + for (size_t i = 0; i < test_suites.size(); ++i) { + if (i != 0) { + *stream << ",\n"; + } + PrintJsonTestSuite(stream, *test_suites[i]); + } + + *stream << "\n" + << kIndent << "]\n" + << "}\n"; +} +// Produces a string representing the test properties in a result as +// a JSON dictionary. +std::string JsonUnitTestResultPrinter::TestPropertiesAsJson( + const TestResult& result, const std::string& indent) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << ",\n" + << indent << "\"" << property.key() << "\": " + << "\"" << EscapeJson(property.value()) << "\""; + } + return attributes.GetString(); +} + +// End JsonUnitTestResultPrinter + +#if GTEST_CAN_STREAM_RESULTS_ + +// Checks if str contains '=', '&', '%' or '\n' characters. If yes, +// replaces them by "%xx" where xx is their hexadecimal value. For +// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) +// in both time and space -- important as the input str may contain an +// arbitrarily long test failure message and stack trace. +std::string StreamingListener::UrlEncode(const char* str) { + std::string result; + result.reserve(strlen(str) + 1); + for (char ch = *str; ch != '\0'; ch = *++str) { + switch (ch) { + case '%': + case '=': + case '&': + case '\n': + result.append("%" + String::FormatByte(static_cast(ch))); + break; + default: + result.push_back(ch); + break; + } + } + return result; +} + +void StreamingListener::SocketWriter::MakeConnection() { + GTEST_CHECK_(sockfd_ == -1) + << "MakeConnection() can't be called when there is already a connection."; + + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_socktype = SOCK_STREAM; + addrinfo* servinfo = nullptr; + + // Use the getaddrinfo() to get a linked list of IP addresses for + // the given host name. + const int error_num = + getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + if (error_num != 0) { + GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " + << gai_strerror(error_num); + } + + // Loop through all the results and connect to the first we can. + for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype, + cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// class OsStackTraceGetter + +const char* const OsStackTraceGetterInterface::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; + +std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) + GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + std::string result; + + if (max_depth <= 0) { + return result; + } + + max_depth = std::min(max_depth, kMaxStackTraceDepth); + + std::vector raw_stack(max_depth); + // Skips the frames requested by the caller, plus this function. + const int raw_stack_size = + absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1); + + void* caller_frame = nullptr; + { + MutexLock lock(&mutex_); + caller_frame = caller_frame_; + } + + for (int i = 0; i < raw_stack_size; ++i) { + if (raw_stack[i] == caller_frame && + !GTEST_FLAG_GET(show_internal_stack_frames)) { + // Add a marker to the trace and stop adding frames. + absl::StrAppend(&result, kElidedFramesMarker, "\n"); + break; + } + + char tmp[1024]; + const char* symbol = "(unknown)"; + if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) { + symbol = tmp; + } + + char line[1024]; + snprintf(line, sizeof(line), " %p: %s\n", raw_stack[i], symbol); + result += line; + } + + return result; + +#else // !GTEST_HAS_ABSL + static_cast(max_depth); + static_cast(skip_count); + return ""; +#endif // GTEST_HAS_ABSL +} + +void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + void* caller_frame = nullptr; + if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) { + caller_frame = nullptr; + } + + MutexLock lock(&mutex_); + caller_frame_ = caller_frame; +#endif // GTEST_HAS_ABSL +} + +// A helper class that creates the premature-exit file in its +// constructor and deletes the file in its destructor. +class ScopedPrematureExitFile { + public: + explicit ScopedPrematureExitFile(const char* premature_exit_filepath) + : premature_exit_filepath_( + premature_exit_filepath ? premature_exit_filepath : "") { + // If a path to the premature-exit file is specified... + if (!premature_exit_filepath_.empty()) { + // create the file with a single "0" character in it. I/O + // errors are ignored as there's nothing better we can do and we + // don't want to fail the test because of this. + FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w"); + fwrite("0", 1, 1, pfile); + fclose(pfile); + } + } + + ~ScopedPrematureExitFile() { +#if !defined GTEST_OS_ESP8266 + if (!premature_exit_filepath_.empty()) { + int retval = remove(premature_exit_filepath_.c_str()); + if (retval) { + GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \"" + << premature_exit_filepath_ << "\" with error " + << retval; + } + } +#endif + } + + private: + const std::string premature_exit_filepath_; + + ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete; + ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete; +}; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(nullptr), + default_xml_generator_(nullptr) {} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. +void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. +TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = nullptr; + else if (listener == default_xml_generator_) + default_xml_generator_ = nullptr; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != nullptr) Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != nullptr) Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest* UnitTest::GetInstance() { + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // defined(__BORLANDC__) +} + +// Gets the number of successful test suites. +int UnitTest::successful_test_suite_count() const { + return impl()->successful_test_suite_count(); +} + +// Gets the number of failed test suites. +int UnitTest::failed_test_suite_count() const { + return impl()->failed_test_suite_count(); +} + +// Gets the number of all test suites. +int UnitTest::total_test_suite_count() const { + return impl()->total_test_suite_count(); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTest::test_suite_to_run_count() const { + return impl()->test_suite_to_run_count(); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +int UnitTest::successful_test_case_count() const { + return impl()->successful_test_suite_count(); +} +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_suite_count(); +} +int UnitTest::total_test_case_count() const { + return impl()->total_test_suite_count(); +} +int UnitTest::test_case_to_run_count() const { + return impl()->test_suite_to_run_count(); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of skipped tests. +int UnitTest::skipped_test_count() const { + return impl()->skipped_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTest::reportable_disabled_test_count() const { + return impl()->reportable_disabled_test_count(); +} + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTest::reportable_test_count() const { + return impl()->reportable_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the time of the test program start, in ms from the start of the +// UNIX epoch. +internal::TimeInMillis UnitTest::start_timestamp() const { + return impl()->start_timestamp(); +} + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true if and only if the unit test passed (i.e. all test suites +// passed). +bool UnitTest::Passed() const { return impl()->Passed(); } + +// Returns true if and only if the unit test failed (i.e. some test suite +// failed or something outside of all tests failed). +bool UnitTest::Failed() const { return impl()->Failed(); } + +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +const TestSuite* UnitTest::GetTestSuite(int i) const { + return impl()->GetTestSuite(i); +} + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +const TestCase* UnitTest::GetTestCase(int i) const { + return impl()->GetTestCase(i); +} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// Returns the TestResult containing information on test failures and +// properties logged outside of individual test suites. +const TestResult& UnitTest::ad_hoc_test_result() const { + return *impl()->ad_hoc_test_result(); +} + +// Gets the i-th test suite among all the test suites. i can range from 0 to +// total_test_suite_count() - 1. If i is not in that range, returns NULL. +TestSuite* UnitTest::GetMutableTestSuite(int i) { + return impl()->GetMutableSuiteCase(i); +} + +// Returns the list of event listeners that can be used to track events +// inside Google Test. +TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); } + +// Registers and returns a global test environment. When a test +// program is run, all global test environments will be set-up in the +// order they were registered. After all tests in the program have +// finished, all global test environments will be torn-down in the +// *reverse* order they were registered. +// +// The UnitTest object takes ownership of the given environment. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +Environment* UnitTest::AddEnvironment(Environment* env) { + if (env == nullptr) { + return nullptr; + } + + impl_->environments().push_back(env); + return env; +} + +// Adds a TestPartResult to the current TestResult object. All Google Test +// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call +// this to report their results. The user code should use the +// assertion macros instead of calling this directly. +void UnitTest::AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, int line_number, + const std::string& message, + const std::string& os_stack_trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + Message msg; + msg << message; + + internal::MutexLock lock(&mutex_); + if (impl_->gtest_trace_stack().size() > 0) { + msg << "\n" << GTEST_NAME_ << " trace:"; + + for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) { + const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; + msg << "\n" + << internal::FormatFileLocation(trace.file, trace.line) << " " + << trace.message; + } + } + + if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) { + msg << internal::kStackTraceMarker << os_stack_trace; + } + + const TestPartResult result = TestPartResult( + result_type, file_name, line_number, msg.GetString().c_str()); + impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult( + result); + + if (result_type != TestPartResult::kSuccess && + result_type != TestPartResult::kSkip) { + // gtest_break_on_failure takes precedence over + // gtest_throw_on_failure. This allows a user to set the latter + // in the code (perhaps in order to use Google Test assertions + // with another testing framework) and specify the former on the + // command line for debugging. + if (GTEST_FLAG_GET(break_on_failure)) { +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // Using DebugBreak on Windows allows gtest to still break into a debugger + // when a failure happens and both the --gtest_break_on_failure and + // the --gtest_catch_exceptions flags are specified. + DebugBreak(); +#elif (!defined(__native_client__)) && \ + ((defined(__clang__) || defined(__GNUC__)) && \ + (defined(__x86_64__) || defined(__i386__))) + // with clang/gcc we can achieve the same effect on x86 by invoking int3 + asm("int3"); +#else + // Dereference nullptr through a volatile pointer to prevent the compiler + // from removing. We use this rather than abort() or __builtin_trap() for + // portability: some debuggers don't correctly trap abort(). + *static_cast(nullptr) = 1; +#endif // GTEST_OS_WINDOWS + } else if (GTEST_FLAG_GET(throw_on_failure)) { +#if GTEST_HAS_EXCEPTIONS + throw internal::GoogleTestFailureException(result); +#else + // We cannot call abort() as it generates a pop-up in debug mode + // that cannot be suppressed in VC 7.1 or below. + exit(1); +#endif + } + } +} + +// Adds a TestProperty to the current TestResult object when invoked from +// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked +// from SetUpTestSuite or TearDownTestSuite, or to the global property set +// when invoked elsewhere. If the result already contains a property with +// the same key, the value will be updated. +void UnitTest::RecordProperty(const std::string& key, + const std::string& value) { + impl_->RecordProperty(TestProperty(key, value)); +} + +// Runs all tests in this UnitTest object and prints the result. +// Returns 0 if successful, or 1 otherwise. +// +// We don't protect this under mutex_, as we only support calling it +// from the main thread. +int UnitTest::Run() { + const bool in_death_test_child_process = + GTEST_FLAG_GET(internal_run_death_test).length() > 0; + + // Google Test implements this protocol for catching that a test + // program exits before returning control to Google Test: + // + // 1. Upon start, Google Test creates a file whose absolute path + // is specified by the environment variable + // TEST_PREMATURE_EXIT_FILE. + // 2. When Google Test has finished its work, it deletes the file. + // + // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before + // running a Google-Test-based test program and check the existence + // of the file at the end of the test execution to see if it has + // exited prematurely. + + // If we are in the child process of a death test, don't + // create/delete the premature exit file, as doing so is unnecessary + // and will confuse the parent process. Otherwise, create/delete + // the file upon entering/leaving this function. If the program + // somehow exits before this function has a chance to return, the + // premature-exit file will be left undeleted, causing a test runner + // that understands the premature-exit-file protocol to report the + // test as having failed. + const internal::ScopedPrematureExitFile premature_exit_file( + in_death_test_child_process + ? nullptr + : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE")); + + // Captures the value of GTEST_FLAG(catch_exceptions). This value will be + // used for the duration of the program. + impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions)); + +#if GTEST_OS_WINDOWS + // Either the user wants Google Test to catch exceptions thrown by the + // tests or this is executing in the context of death test child + // process. In either case the user does not want to see pop-up dialogs + // about crashes - they are expected. + if (impl()->catch_exceptions() || in_death_test_child_process) { +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + // SetErrorMode doesn't exist on CE. + SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | + SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); +#endif // !GTEST_OS_WINDOWS_MOBILE + +#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE + // Death test children can be terminated with _abort(). On Windows, + // _abort() can show a dialog with a warning message. This forces the + // abort message to go to stderr instead. + _set_error_mode(_OUT_TO_STDERR); +#endif + +#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE + // In the debug version, Visual Studio pops up a separate dialog + // offering a choice to debug the aborted program. We need to suppress + // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement + // executed. Google Test will notify the user of any unexpected + // failure via stderr. + if (!GTEST_FLAG_GET(break_on_failure)) + _set_abort_behavior( + 0x0, // Clear the following flags: + _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. + + // In debug mode, the Windows CRT can crash with an assertion over invalid + // input (e.g. passing an invalid file descriptor). The default handling + // for these assertions is to pop up a dialog and wait for user input. + // Instead ask the CRT to dump such assertions to stderr non-interactively. + if (!IsDebuggerPresent()) { + (void)_CrtSetReportMode(_CRT_ASSERT, + _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG); + (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); + } +#endif + } +#endif // GTEST_OS_WINDOWS + + return internal::HandleExceptionsInMethodIfSupported( + impl(), &internal::UnitTestImpl::RunAllTests, + "auxiliary test code (environments or event listeners)") + ? 0 + : 1; +} + +// Returns the working directory when the first TEST() or TEST_F() was +// executed. +const char* UnitTest::original_working_dir() const { + return impl_->original_working_dir_.c_str(); +} + +// Returns the TestSuite object for the test that's currently running, +// or NULL if no test is running. +const TestSuite* UnitTest::current_test_suite() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_suite(); +} + +// Legacy API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +const TestCase* UnitTest::current_test_case() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_suite(); +} +#endif + +// Returns the TestInfo object for the test that's currently running, +// or NULL if no test is running. +const TestInfo* UnitTest::current_test_info() const + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + return impl_->current_test_info(); +} + +// Returns the random seed used at the start of the current test run. +int UnitTest::random_seed() const { return impl_->random_seed(); } + +// Returns ParameterizedTestSuiteRegistry object used to keep track of +// value-parameterized tests and instantiate and register them. +internal::ParameterizedTestSuiteRegistry& +UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) { + return impl_->parameterized_test_registry(); +} + +// Creates an empty UnitTest. +UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); } + +// Destructor of UnitTest. +UnitTest::~UnitTest() { delete impl_; } + +// Pushes a trace defined by SCOPED_TRACE() on to the per-thread +// Google Test trace stack. +void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) + GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().push_back(trace); +} + +// Pops a trace from the per-thread Google Test trace stack. +void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) { + internal::MutexLock lock(&mutex_); + impl_->gtest_trace_stack().pop_back(); +} + +namespace internal { + +UnitTestImpl::UnitTestImpl(UnitTest* parent) + : parent_(parent), + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */) + default_global_test_part_result_reporter_(this), + default_per_thread_test_part_result_reporter_(this), + GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_( + &default_global_test_part_result_reporter_), + per_thread_test_part_result_reporter_( + &default_per_thread_test_part_result_reporter_), + parameterized_test_registry_(), + parameterized_tests_registered_(false), + last_death_test_suite_(-1), + current_test_suite_(nullptr), + current_test_info_(nullptr), + ad_hoc_test_result_(), + os_stack_trace_getter_(nullptr), + post_flag_parse_init_performed_(false), + random_seed_(0), // Will be overridden by the flag before first use. + random_(0), // Will be reseeded before first use. + start_timestamp_(0), + elapsed_time_(0), +#if GTEST_HAS_DEATH_TEST + death_test_factory_(new DefaultDeathTestFactory), +#endif + // Will be overridden by the flag before first use. + catch_exceptions_(false) { + listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); +} + +UnitTestImpl::~UnitTestImpl() { + // Deletes every TestSuite. + ForEach(test_suites_, internal::Delete); + + // Deletes every Environment. + ForEach(environments_, internal::Delete); + + delete os_stack_trace_getter_; +} + +// Adds a TestProperty to the current TestResult object when invoked in a +// context of a test, to current test suite's ad_hoc_test_result when invoke +// from SetUpTestSuite/TearDownTestSuite, or to the global property set +// otherwise. If the result already contains a property with the same key, +// the value will be updated. +void UnitTestImpl::RecordProperty(const TestProperty& test_property) { + std::string xml_element; + TestResult* test_result; // TestResult appropriate for property recording. + + if (current_test_info_ != nullptr) { + xml_element = "testcase"; + test_result = &(current_test_info_->result_); + } else if (current_test_suite_ != nullptr) { + xml_element = "testsuite"; + test_result = &(current_test_suite_->ad_hoc_test_result_); + } else { + xml_element = "testsuites"; + test_result = &ad_hoc_test_result_; + } + test_result->RecordProperty(xml_element, test_property); +} + +#if GTEST_HAS_DEATH_TEST +// Disables event forwarding if the control is currently in a death test +// subprocess. Must not be called before InitGoogleTest. +void UnitTestImpl::SuppressTestEventsIfInSubprocess() { + if (internal_run_death_test_flag_.get() != nullptr) + listeners()->SuppressEventForwarding(); +} +#endif // GTEST_HAS_DEATH_TEST + +// Initializes event listeners performing XML output as specified by +// UnitTestOptions. Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureXmlOutput() { + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format == "json") { + listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \"" + << output_format << "\" ignored."; + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in string form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const std::string& target = GTEST_FLAG_GET(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != std::string::npos) { + listeners()->Append( + new StreamingListener(target.substr(0, pos), target.substr(pos + 1))); + } else { + GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target + << "\" ignored."; + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + // Register to send notifications about key process state changes. + listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_()); +#endif // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_) + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. This makes parameterized tests + // available to the UnitTest reflection API without running + // RUN_ALL_TESTS. + RegisterParameterizedTests(); + + // Configures listeners for XML output. This makes it possible for users + // to shut down the default XML output before invoking RUN_ALL_TESTS. + ConfigureXmlOutput(); + + if (GTEST_FLAG_GET(brief)) { + listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter); + } + +#if GTEST_CAN_STREAM_RESULTS_ + // Configures listeners for streaming test results to the specified server. + ConfigureStreamingOutput(); +#endif // GTEST_CAN_STREAM_RESULTS_ + +#if GTEST_HAS_ABSL + if (GTEST_FLAG_GET(install_failure_signal_handler)) { + absl::FailureSignalHandlerOptions options; + absl::InstallFailureSignalHandler(options); + } +#endif // GTEST_HAS_ABSL + } +} + +// A predicate that checks the name of a TestSuite against a known +// value. +// +// This is used for implementation of the UnitTest class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestSuiteNameIs is copyable. +class TestSuiteNameIs { + public: + // Constructor. + explicit TestSuiteNameIs(const std::string& name) : name_(name) {} + + // Returns true if and only if the name of test_suite matches name_. + bool operator()(const TestSuite* test_suite) const { + return test_suite != nullptr && + strcmp(test_suite->name(), name_.c_str()) == 0; + } + + private: + std::string name_; +}; + +// Finds and returns a TestSuite with the given name. If one doesn't +// exist, creates one and returns it. It's the CALLER'S +// RESPONSIBILITY to ensure that this function is only called WHEN THE +// TESTS ARE NOT SHUFFLED. +// +// Arguments: +// +// test_suite_name: name of the test suite +// type_param: the name of the test suite's type parameter, or NULL if +// this is not a typed or a type-parameterized test suite. +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +TestSuite* UnitTestImpl::GetTestSuite( + const char* test_suite_name, const char* type_param, + internal::SetUpTestSuiteFunc set_up_tc, + internal::TearDownTestSuiteFunc tear_down_tc) { + // Can we find a TestSuite with the given name? + const auto test_suite = + std::find_if(test_suites_.rbegin(), test_suites_.rend(), + TestSuiteNameIs(test_suite_name)); + + if (test_suite != test_suites_.rend()) return *test_suite; + + // No. Let's create one. + auto* const new_test_suite = + new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc); + + const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter); + // Is this a death test suite? + if (death_test_suite_filter.MatchesName(test_suite_name)) { + // Yes. Inserts the test suite after the last death test suite + // defined so far. This only works when the test suites haven't + // been shuffled. Otherwise we may end up running a death test + // after a non-death test. + ++last_death_test_suite_; + test_suites_.insert(test_suites_.begin() + last_death_test_suite_, + new_test_suite); + } else { + // No. Appends to the end of the list. + test_suites_.push_back(new_test_suite); + } + + test_suite_indices_.push_back(static_cast(test_suite_indices_.size())); + return new_test_suite; +} + +// Helpers for setting up / tearing down the given environment. They +// are for use in the ForEach() function. +static void SetUpEnvironment(Environment* env) { env->SetUp(); } +static void TearDownEnvironment(Environment* env) { env->TearDown(); } + +// Runs all tests in this UnitTest object, prints the result, and +// returns true if all tests are successful. If any exception is +// thrown during a test, the test is considered to be failed, but the +// rest of the tests will still be run. +// +// When parameterized tests are enabled, it expands and registers +// parameterized tests first in RegisterParameterizedTests(). +// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // True if and only if Google Test is initialized before RUN_ALL_TESTS() is + // called. + const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized(); + + // Do not run any test if the --help flag was specified. + if (g_help_flag) return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True if and only if we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = + (internal_run_death_test_flag_.get() != nullptr); +#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) + if (in_subprocess_for_death_test) { + GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_(); + } +#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_) +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = + FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG_GET(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed)); + + // True if and only if at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + start_timestamp_ = GetTimeInMillis(); + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat); + + // Repeats forever if the repeat count is negative. + const bool gtest_repeat_forever = repeat < 0; + + // Should test environments be set up and torn down for each repeat, or only + // set up on the first and torn down on the last iteration? If there is no + // "last" iteration because the tests will repeat forever, always recreate the + // environments to avoid leaks in case one of the environments is using + // resources that are external to this process. Without this check there would + // be no way to clean up those external resources automatically. + const bool recreate_environments_when_repeating = + GTEST_FLAG_GET(recreate_environments_when_repeating) || + gtest_repeat_forever; + + for (int i = 0; gtest_repeat_forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). + ClearNonAdHocTestResult(); + + Timer timer; + + // Shuffles test suites and tests if requested. + if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) { + random()->Reseed(static_cast(random_seed_)); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test suite if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. If test environments aren't + // recreated for each iteration, only do so on the first iteration. + if (i == 0 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + } + + // Runs the tests only if there was no fatal failure or skip triggered + // during global set-up. + if (Test::IsSkipped()) { + // Emit diagnostics when global set-up calls skip, as it will not be + // emitted by default. + TestResult& test_result = + *internal::GetUnitTestImpl()->current_test_result(); + for (int j = 0; j < test_result.total_part_count(); ++j) { + const TestPartResult& test_part_result = + test_result.GetTestPartResult(j); + if (test_part_result.type() == TestPartResult::kSkip) { + const std::string& result = test_part_result.message(); + printf("%s\n", result.c_str()); + } + } + fflush(stdout); + } else if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Run(); + if (GTEST_FLAG_GET(fail_fast) && + GetMutableSuiteCase(test_index)->Failed()) { + for (int j = test_index + 1; j < total_test_suite_count(); j++) { + GetMutableSuiteCase(j)->Skip(); + } + break; + } + } + } else if (Test::HasFatalFailure()) { + // If there was a fatal failure during the global setup then we know we + // aren't going to run any tests. Explicitly mark all of the tests as + // skipped to make this obvious in the output. + for (int test_index = 0; test_index < total_test_suite_count(); + test_index++) { + GetMutableSuiteCase(test_index)->Skip(); + } + } + + // Tears down all environments in reverse order afterwards. If test + // environments aren't recreated for each iteration, only do so on the + // last iteration. + if (i == repeat - 1 || recreate_environments_when_repeating) { + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + } + + elapsed_time_ = timer.Elapsed(); + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). + UnshuffleTests(); + + if (GTEST_FLAG_GET(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + if (!gtest_is_initialized_before_run_all_tests) { + ColoredPrintf( + GTestColor::kRed, + "\nIMPORTANT NOTICE - DO NOT IGNORE:\n" + "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_ + "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_ + " will start to enforce the valid usage. " + "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT +#if GTEST_FOR_GOOGLE_ + ColoredPrintf(GTestColor::kRed, + "For more details, see http://wiki/Main/ValidGUnitMain.\n"); +#endif // GTEST_FOR_GOOGLE_ + } + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != nullptr) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == nullptr) { + ColoredPrintf(GTestColor::kRed, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. +bool ShouldShard(const char* total_shards_env, const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards + << " unset.\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = + Message() << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. +int32_t Int32FromEnvOrDie(const char* var, int32_t default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == nullptr) { + return default_val; + } + + int32_t result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true if and only if the test should be run on this shard. The test id +// is some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestSuite and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md +// . Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestTotalShards, -1) + : -1; + const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL + ? Int32FromEnvOrDie(kTestShardIndex, -1) + : -1; + + const PositiveAndNegativeUnitTestFilter gtest_flag_filter( + GTEST_FLAG_GET(filter)); + const UnitTestFilter disable_test_filter(kDisableTestFilter); + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. + int num_runnable_tests = 0; + int num_selected_tests = 0; + for (auto* test_suite : test_suites_) { + const std::string& test_suite_name = test_suite->name(); + test_suite->set_should_run(false); + + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + TestInfo* const test_info = test_suite->test_info_list()[j]; + const std::string test_name(test_info->name()); + // A test is disabled if test suite name or test name matches + // kDisableTestFilter. + const bool is_disabled = + disable_test_filter.MatchesName(test_suite_name) || + disable_test_filter.MatchesName(test_name); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + gtest_flag_filter.MatchesTest(test_suite_name, test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_in_another_shard = + shard_tests != IGNORE_SHARDING_PROTOCOL && + !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests); + test_info->is_in_another_shard_ = is_in_another_shard; + const bool is_selected = is_runnable && !is_in_another_shard; + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_suite->set_should_run(test_suite->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the given C-string on a single line by replacing all '\n' +// characters with string "\\n". If the output takes more than +// max_length characters, only prints the first max_length characters +// and "...". +static void PrintOnOneLine(const char* str, int max_length) { + if (str != nullptr) { + for (int i = 0; *str != '\0'; ++str) { + if (i >= max_length) { + printf("..."); + break; + } + if (*str == '\n') { + printf("\\n"); + i += 2; + } else { + printf("%c", *str); + ++i; + } + } + } +} + +// Prints the names of the tests matching the user-specified filter flag. +void UnitTestImpl::ListTestsMatchingFilter() { + // Print at most this many characters for each type/value parameter. + const int kMaxParamLength = 250; + + for (auto* test_suite : test_suites_) { + bool printed_test_suite_name = false; + + for (size_t j = 0; j < test_suite->test_info_list().size(); j++) { + const TestInfo* const test_info = test_suite->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_suite_name) { + printed_test_suite_name = true; + printf("%s.", test_suite->name()); + if (test_suite->type_param() != nullptr) { + printf(" # %s = ", kTypeParamLabel); + // We print the type parameter on a single line to make + // the output easy to parse by a program. + PrintOnOneLine(test_suite->type_param(), kMaxParamLength); + } + printf("\n"); + } + printf(" %s", test_info->name()); + if (test_info->value_param() != nullptr) { + printf(" # %s = ", kValueParamLabel); + // We print the value parameter on a single line to make the + // output easy to parse by a program. + PrintOnOneLine(test_info->value_param(), kMaxParamLength); + } + printf("\n"); + } + } + } + fflush(stdout); + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml" || output_format == "json") { + FILE* fileout = OpenFileForWriting( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()); + std::stringstream stream; + if (output_format == "xml") { + XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintXmlTestsList(&stream, test_suites_); + } else if (output_format == "json") { + JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintJsonTestList(&stream, test_suites_); + } + fprintf(fileout, "%s", StringStreamToString(&stream).c_str()); + fclose(fileout); + } +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. +void UnitTestImpl::set_os_stack_trace_getter( + OsStackTraceGetterInterface* getter) { + if (os_stack_trace_getter_ != getter) { + delete os_stack_trace_getter_; + os_stack_trace_getter_ = getter; + } +} + +// Returns the current OS stack trace getter if it is not NULL; +// otherwise, creates an OsStackTraceGetter, makes it the current +// getter, and returns it. +OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { + if (os_stack_trace_getter_ == nullptr) { +#ifdef GTEST_OS_STACK_TRACE_GETTER_ + os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_; +#else + os_stack_trace_getter_ = new OsStackTraceGetter; +#endif // GTEST_OS_STACK_TRACE_GETTER_ + } + + return os_stack_trace_getter_; +} + +// Returns the most specific TestResult currently running. +TestResult* UnitTestImpl::current_test_result() { + if (current_test_info_ != nullptr) { + return ¤t_test_info_->result_; + } + if (current_test_suite_ != nullptr) { + return ¤t_test_suite_->ad_hoc_test_result_; + } + return &ad_hoc_test_result_; +} + +// Shuffles all test suites, and the tests within each test suite, +// making sure that death tests are still run first. +void UnitTestImpl::ShuffleTests() { + // Shuffles the death test suites. + ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_); + + // Shuffles the non-death test suites. + ShuffleRange(random(), last_death_test_suite_ + 1, + static_cast(test_suites_.size()), &test_suite_indices_); + + // Shuffles the tests inside each test suite. + for (auto& test_suite : test_suites_) { + test_suite->ShuffleTests(random()); + } +} + +// Restores the test suites and tests to their order before the first shuffle. +void UnitTestImpl::UnshuffleTests() { + for (size_t i = 0; i < test_suites_.size(); i++) { + // Unshuffles the tests in each test suite. + test_suites_[i]->UnshuffleTests(); + // Resets the index of each test suite. + test_suite_indices_[i] = static_cast(i); + } +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string +GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) { + // We pass skip_count + 1 to skip this wrapper function in addition + // to what the user really wants to skip. + return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); +} + +// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to +// suppress unreachable code warnings. +namespace { +class ClassUniqueToAlwaysTrue {}; +} // namespace + +bool IsTrue(bool condition) { return condition; } + +bool AlwaysTrue() { +#if GTEST_HAS_EXCEPTIONS + // This condition is always false so AlwaysTrue() never actually throws, + // but it makes the compiler think that it may throw. + if (IsTrue(false)) throw ClassUniqueToAlwaysTrue(); +#endif // GTEST_HAS_EXCEPTIONS + return true; +} + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +static const char* ParseFlagValue(const char* str, const char* flag_name, + bool def_optional) { + // str and flag must not be NULL. + if (str == nullptr || flag_name == nullptr) return nullptr; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const std::string flag_str = + std::string("--") + GTEST_FLAG_PREFIX_ + flag_name; + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return nullptr; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +static bool ParseFlag(const char* str, const char* flag_name, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, true); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an int32_t flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseFlag(const char* str, const char* flag_name, int32_t* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, false); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag_name, value_str, + value); +} + +// Parses a string for a string flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +template +static bool ParseFlag(const char* str, const char* flag_name, String* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag_name, false); + + // Aborts if the parsing failed. + if (value_str == nullptr) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. +// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +static void PrintColorEncoded(const char* str) { + GTestColor color = GTestColor::kDefault; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. + for (;;) { + const char* p = strchr(str, '@'); + if (p == nullptr) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", std::string(str, p).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = GTestColor::kDefault; + } else if (ch == 'R') { + color = GTestColor::kRed; + } else if (ch == 'G') { + color = GTestColor::kGreen; + } else if (ch == 'Y') { + color = GTestColor::kYellow; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = + "This program contains tests written using " GTEST_NAME_ + ". You can use the\n" + "following command line flags to control its behavior:\n" + "\n" + "Test Selection:\n" + " @G--" GTEST_FLAG_PREFIX_ + "list_tests@D\n" + " List the names of all tests instead of running them. The name of\n" + " TEST(Foo, Bar) is \"Foo.Bar\".\n" + " @G--" GTEST_FLAG_PREFIX_ + "filter=@YPOSITIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" + " Run only the tests whose name matches one of the positive patterns " + "but\n" + " none of the negative patterns. '?' matches any single character; " + "'*'\n" + " matches any substring; ':' separates two patterns.\n" + " @G--" GTEST_FLAG_PREFIX_ + "also_run_disabled_tests@D\n" + " Run all disabled tests too.\n" + "\n" + "Test Execution:\n" + " @G--" GTEST_FLAG_PREFIX_ + "repeat=@Y[COUNT]@D\n" + " Run the tests repeatedly; use a negative count to repeat forever.\n" + " @G--" GTEST_FLAG_PREFIX_ + "shuffle@D\n" + " Randomize tests' orders on every iteration.\n" + " @G--" GTEST_FLAG_PREFIX_ + "random_seed=@Y[NUMBER]@D\n" + " Random number seed to use for shuffling test orders (between 1 and\n" + " 99999, or 0 to use a seed based on the current time).\n" + " @G--" GTEST_FLAG_PREFIX_ + "recreate_environments_when_repeating@D\n" + " Sets up and tears down the global test environment on each repeat\n" + " of the test.\n" + "\n" + "Test Output:\n" + " @G--" GTEST_FLAG_PREFIX_ + "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" + " Enable/disable colored output. The default is @Gauto@D.\n" + " @G--" GTEST_FLAG_PREFIX_ + "brief=1@D\n" + " Only print test failures.\n" + " @G--" GTEST_FLAG_PREFIX_ + "print_time=0@D\n" + " Don't print the elapsed time of each test.\n" + " @G--" GTEST_FLAG_PREFIX_ + "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_ + "@Y|@G:@YFILE_PATH]@D\n" + " Generate a JSON or XML report in the given directory or with the " + "given\n" + " file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n" +#if GTEST_CAN_STREAM_RESULTS_ + " @G--" GTEST_FLAG_PREFIX_ + "stream_result_to=@YHOST@G:@YPORT@D\n" + " Stream test results to the given server.\n" +#endif // GTEST_CAN_STREAM_RESULTS_ + "\n" + "Assertion Behavior:\n" +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS + " @G--" GTEST_FLAG_PREFIX_ + "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" + " Set the default death test style.\n" +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS + " @G--" GTEST_FLAG_PREFIX_ + "break_on_failure@D\n" + " Turn assertion failures into debugger break-points.\n" + " @G--" GTEST_FLAG_PREFIX_ + "throw_on_failure@D\n" + " Turn assertion failures into C++ exceptions for use by an external\n" + " test framework.\n" + " @G--" GTEST_FLAG_PREFIX_ + "catch_exceptions=0@D\n" + " Do not report exceptions as test failures. Instead, allow them\n" + " to crash the program or throw a pop-up (on Windows).\n" + "\n" + "Except for @G--" GTEST_FLAG_PREFIX_ + "list_tests@D, you can alternatively set " + "the corresponding\n" + "environment variable of a flag (all letters in upper-case). For example, " + "to\n" + "disable colored text output, you can either specify " + "@G--" GTEST_FLAG_PREFIX_ + "color=no@D or set\n" + "the @G" GTEST_FLAG_PREFIX_UPPER_ + "COLOR@D environment variable to @Gno@D.\n" + "\n" + "For more information, please read the " GTEST_NAME_ + " documentation at\n" + "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ + "\n" + "(not one in your own code or tests), please report it to\n" + "@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + +static bool ParseGoogleTestFlag(const char* const arg) { +#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \ + do { \ + auto value = GTEST_FLAG_GET(flag_name); \ + if (ParseFlag(arg, #flag_name, &value)) { \ + GTEST_FLAG_SET(flag_name, value); \ + return true; \ + } \ + } while (false) + + GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests); + GTEST_INTERNAL_PARSE_FLAG(break_on_failure); + GTEST_INTERNAL_PARSE_FLAG(catch_exceptions); + GTEST_INTERNAL_PARSE_FLAG(color); + GTEST_INTERNAL_PARSE_FLAG(death_test_style); + GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork); + GTEST_INTERNAL_PARSE_FLAG(fail_fast); + GTEST_INTERNAL_PARSE_FLAG(filter); + GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test); + GTEST_INTERNAL_PARSE_FLAG(list_tests); + GTEST_INTERNAL_PARSE_FLAG(output); + GTEST_INTERNAL_PARSE_FLAG(brief); + GTEST_INTERNAL_PARSE_FLAG(print_time); + GTEST_INTERNAL_PARSE_FLAG(print_utf8); + GTEST_INTERNAL_PARSE_FLAG(random_seed); + GTEST_INTERNAL_PARSE_FLAG(repeat); + GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating); + GTEST_INTERNAL_PARSE_FLAG(shuffle); + GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth); + GTEST_INTERNAL_PARSE_FLAG(stream_result_to); + GTEST_INTERNAL_PARSE_FLAG(throw_on_failure); + return false; +} + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +static void LoadFlagsFromFile(const std::string& path) { + FILE* flagfile = posix::FOpen(path.c_str(), "r"); + if (!flagfile) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile) + << "\""; + } + std::string contents(ReadEntireFile(flagfile)); + posix::FClose(flagfile); + std::vector lines; + SplitString(contents, '\n', &lines); + for (size_t i = 0; i < lines.size(); ++i) { + if (lines[i].empty()) continue; + if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true; + } +} +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. The type parameter CharType can be +// instantiated to either char or wchar_t. +template +void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + std::string flagfile_value; + for (int i = 1; i < *argc; i++) { + const std::string arg_string = StreamableToString(argv[i]); + const char* const arg = arg_string.c_str(); + + using internal::ParseFlag; + + bool remove_flag = false; + if (ParseGoogleTestFlag(arg)) { + remove_flag = true; +#if GTEST_USE_OWN_FLAGFILE_FLAG_ + } else if (ParseFlag(arg, "flagfile", &flagfile_value)) { + GTEST_FLAG_SET(flagfile, flagfile_value); + LoadFlagsFromFile(flagfile_value); + remove_flag = true; +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) { + // Both help flag and unrecognized Google Test flags (excluding + // internal ones) trigger help display. + g_help_flag = true; + } + + if (remove_flag) { + // Shift the remainder of the argv list left by one. Note + // that argv has (*argc + 1) elements, the last one always being + // NULL. The following loop moves the trailing NULL element as + // well. + for (int j = i; j != *argc; j++) { + argv[j] = argv[j + 1]; + } + + // Decrements the argument count. + (*argc)--; + + // We also need to decrement the iterator as we just removed + // an element. + i--; + } + } + + if (g_help_flag) { + // We print the help here instead of in RUN_ALL_TESTS(), as the + // latter may not be called at all if the user is using Google + // Test with another testing framework. + PrintColorEncoded(kColorEncodedHelpMessage); + } +} + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +void ParseGoogleTestFlagsOnly(int* argc, char** argv) { +#if GTEST_HAS_ABSL + if (*argc > 0) { + // absl::ParseCommandLine() requires *argc > 0. + auto positional_args = absl::flags_internal::ParseCommandLineImpl( + *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs, + absl::flags_internal::UsageFlagsAction::kHandleUsage, + absl::flags_internal::OnUndefinedFlag::kReportUndefined); + // Any command-line positional arguments not part of any command-line flag + // (or arguments to a flag) are copied back out to argv, with the program + // invocation name at position 0, and argc is resized. This includes + // positional arguments after the flag-terminating delimiter '--'. + // See https://abseil.io/docs/cpp/guides/flags. + std::copy(positional_args.begin(), positional_args.end(), argv); + if (static_cast(positional_args.size()) < *argc) { + argv[positional_args.size()] = nullptr; + *argc = static_cast(positional_args.size()); + } + } +#else + ParseGoogleTestFlagsOnlyImpl(argc, argv); +#endif + + // Fix the value of *_NSGetArgc() on macOS, but if and only if + // *_NSGetArgv() == argv + // Only applicable to char** version of argv +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS + if (*_NSGetArgv() == argv) { + *_NSGetArgc() = *argc; + } +#endif +#endif +} +void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} + +// The internal implementation of InitGoogleTest(). +// +// The type parameter CharType can be instantiated to either char or +// wchar_t. +template +void InitGoogleTestImpl(int* argc, CharType** argv) { + // We don't want to run the initialization code twice. + if (GTestIsInitialized()) return; + + if (*argc <= 0) return; + + g_argvs.clear(); + for (int i = 0; i != *argc; i++) { + g_argvs.push_back(StreamableToString(argv[i])); + } + +#if GTEST_HAS_ABSL + absl::InitializeSymbolizer(g_argvs[0].c_str()); + + // When using the Abseil Flags library, set the program usage message to the + // help message, but remove the color-encoding from the message first. + absl::SetProgramUsageMessage(absl::StrReplaceAll( + kColorEncodedHelpMessage, + {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}})); +#endif // GTEST_HAS_ABSL + + ParseGoogleTestFlagsOnly(argc, argv); + GetUnitTestImpl()->PostFlagParsingInit(); +} + +} // namespace internal + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. +// +// Calling the function for the second time has no user-visible effect. +void InitGoogleTest(int* argc, char** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +void InitGoogleTest(int* argc, wchar_t** argv) { +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +// This overloaded version can be used on Arduino/embedded platforms where +// there is no argc/argv. +void InitGoogleTest() { + // Since Arduino doesn't have a command line, fake out the argc/argv arguments + int argc = 1; + const auto arg0 = "dummy"; + char* argv0 = const_cast(arg0); + char** argv = &argv0; + +#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv); +#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) + internal::InitGoogleTestImpl(&argc, argv); +#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_) +} + +#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) +// Return value of first environment variable that is set and contains +// a non-empty string. If there are none, return the "fallback" string. +// Since we like the temporary directory to have a directory separator suffix, +// add it if not provided in the environment variable value. +static std::string GetTempDirFromEnv( + std::initializer_list environment_variables, + const char* fallback, char separator) { + for (const char* variable_name : environment_variables) { + const char* value = internal::posix::GetEnv(variable_name); + if (value != nullptr && value[0] != '\0') { + if (value[strlen(value) - 1] != separator) { + return std::string(value).append(1, separator); + } + return value; + } + } + return fallback; +} +#endif + +std::string TempDir() { +#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_) + return GTEST_CUSTOM_TEMPDIR_FUNCTION_(); +#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE + return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\'); +#elif GTEST_OS_LINUX_ANDROID + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/'); +#else + return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/'); +#endif +} + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +void ScopedTrace::PushTrace(const char* file, int line, std::string message) { + internal::TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message.swap(message); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + +} // namespace testing diff --git a/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc index f302822552..44976375c9 100644 --- a/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc +++ b/media/libvpx/libvpx/third_party/googletest/src/src/gtest_main.cc @@ -27,12 +27,27 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include +#include #include "gtest/gtest.h" +#if GTEST_OS_ESP8266 || GTEST_OS_ESP32 +#if GTEST_OS_ESP8266 +extern "C" { +#endif +void setup() { testing::InitGoogleTest(); } + +void loop() { RUN_ALL_TESTS(); } + +#if GTEST_OS_ESP8266 +} +#endif + +#else + GTEST_API_ int main(int argc, char **argv) { - printf("Running main() from gtest_main.cc\n"); + printf("Running main() from %s\n", __FILE__); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } +#endif diff --git a/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT b/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT index 9686ac13eb..59b648ca68 100644 --- a/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT +++ b/media/libvpx/libvpx/third_party/libwebm/AUTHORS.TXT @@ -2,3 +2,4 @@ # Name or Organization Google Inc. +Elijah Cirioli diff --git a/media/libvpx/libvpx/third_party/libwebm/Android.mk b/media/libvpx/libvpx/third_party/libwebm/Android.mk index 8149a083f4..e6c17df021 100644 --- a/media/libvpx/libvpx/third_party/libwebm/Android.mk +++ b/media/libvpx/libvpx/third_party/libwebm/Android.mk @@ -1,9 +1,11 @@ +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE:= libwebm LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat +LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11 LOCAL_C_INCLUDES:= $(LOCAL_PATH) LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH) @@ -14,4 +16,8 @@ LOCAL_SRC_FILES:= common/file_util.cc \ mkvmuxer/mkvmuxer.cc \ mkvmuxer/mkvmuxerutil.cc \ mkvmuxer/mkvwriter.cc +LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD +LOCAL_LICENSE_CONDITIONS := notice +LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) +endif # NDK_ROOT diff --git a/media/libvpx/libvpx/third_party/libwebm/README.libvpx b/media/libvpx/libvpx/third_party/libwebm/README.libvpx index 1f8a13d78c..6e43487540 100644 --- a/media/libvpx/libvpx/third_party/libwebm/README.libvpx +++ b/media/libvpx/libvpx/third_party/libwebm/README.libvpx @@ -1,10 +1,20 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 9732ae991efb71aced4267d4794918279e362d99 +Version: 3b630045052e1e4d563207ab9e3be8d137c26067 License: BSD -License File: LICENSE.txt +License File: LICENSE.TXT Description: libwebm is used to handle WebM container I/O. Local Changes: -* +Only keep: + - Android.mk + - AUTHORS.TXT + - common/ + file_util.cc/h + hdr_util.cc/h + webmids.h + - LICENSE.TXT + - mkvmuxer/ + - mkvparser/ + - PATENTS.TXT diff --git a/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc b/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc index 6dab146dd9..6eb6428b98 100644 --- a/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc +++ b/media/libvpx/libvpx/third_party/libwebm/common/file_util.cc @@ -17,14 +17,15 @@ #include #include #include +#include namespace libwebm { std::string GetTempFileName() { #if !defined _MSC_VER && !defined __MINGW32__ std::string temp_file_name_template_str = - std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") : - ".") + + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") + : ".") + "/libwebm_temp.XXXXXX"; char* temp_file_name_template = new char[temp_file_name_template_str.length() + 1]; @@ -41,7 +42,12 @@ std::string GetTempFileName() { return temp_file_name; #else char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1; +#endif if (err == 0) { return std::string(tmp_file_name); } @@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) { return file_size; } +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } TempFileDeleter::~TempFileDeleter() { diff --git a/media/libvpx/libvpx/third_party/libwebm/common/file_util.h b/media/libvpx/libvpx/third_party/libwebm/common/file_util.h index 0e71eac11e..a873734641 100644 --- a/media/libvpx/libvpx/third_party/libwebm/common/file_util.h +++ b/media/libvpx/libvpx/third_party/libwebm/common/file_util.h @@ -22,6 +22,9 @@ std::string GetTempFileName(); // Returns size of file specified by |file_name|, or 0 upon failure. uint64_t GetFileSize(const std::string& file_name); +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + // Manages life of temporary file specified at time of construction. Deletes // file upon destruction. class TempFileDeleter { @@ -38,4 +41,4 @@ class TempFileDeleter { } // namespace libwebm -#endif // LIBWEBM_COMMON_FILE_UTIL_H_ \ No newline at end of file +#endif // LIBWEBM_COMMON_FILE_UTIL_H_ diff --git a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc index e1618ce75a..f1320a5361 100644 --- a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc +++ b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.cc @@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, if (MasteringMetadataValuePresent(parser_mm.luminance_min)) muxer_mm->set_luminance_min(parser_mm.luminance_min); - PrimaryChromaticityPtr r_ptr(NULL); - PrimaryChromaticityPtr g_ptr(NULL); - PrimaryChromaticityPtr b_ptr(NULL); - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); + PrimaryChromaticityPtr g_ptr(nullptr); + PrimaryChromaticityPtr b_ptr(nullptr); + PrimaryChromaticityPtr wp_ptr(nullptr); if (parser_mm.r) { if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr)) @@ -202,7 +202,8 @@ bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, features->bit_depth = priv_profile; } else if (id_byte == kVp9ChromaSubsamplingId) { const int priv_profile = static_cast(private_data[offset++]); - if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3) + if (priv_profile != 0 && priv_profile != 1 && priv_profile != 2 && + priv_profile != 3) return false; if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent && features->chroma_subsampling != priv_profile) { diff --git a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h index 689fb30a3f..78e2eeb705 100644 --- a/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h +++ b/media/libvpx/libvpx/third_party/libwebm/common/hdr_util.h @@ -47,7 +47,7 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; -typedef std::auto_ptr PrimaryChromaticityPtr; +typedef std::unique_ptr PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/media/libvpx/libvpx/third_party/libwebm/common/webmids.h b/media/libvpx/libvpx/third_party/libwebm/common/webmids.h index 89d722a71b..fc0c208140 100644 --- a/media/libvpx/libvpx/third_party/libwebm/common/webmids.h +++ b/media/libvpx/libvpx/third_party/libwebm/common/webmids.h @@ -93,6 +93,7 @@ enum MkvId { kMkvDisplayHeight = 0x54BA, kMkvDisplayUnit = 0x54B2, kMkvAspectRatioType = 0x54B3, + kMkvColourSpace = 0x2EB524, kMkvFrameRate = 0x2383E3, // end video // colour diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 299b45c989..21e51be474 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvmuxer.h" +#include + #include #include #include @@ -63,11 +65,12 @@ bool StrCpy(const char* src, char** dst_ptr) { if (dst == NULL) return false; - strcpy(dst, src); // NOLINT + memcpy(dst, src, size - 1); + dst[size - 1] = '\0'; return true; } -typedef std::auto_ptr PrimaryChromaticityPtr; +typedef std::unique_ptr PrimaryChromaticityPtr; bool CopyChromaticity(const PrimaryChromaticity* src, PrimaryChromaticityPtr* dst) { if (!dst) @@ -605,10 +608,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { return true; } -uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, +uint64_t ContentEncoding::EncodingSize(uint64_t compression_size, uint64_t encryption_size) const { // TODO(fgalligan): Add support for compression settings. - if (compresion_size != 0) + if (compression_size != 0) return 0; uint64_t encoding_size = 0; @@ -771,6 +774,14 @@ bool Track::Write(IMkvWriter* writer) const { if (!type_ || !codec_id_) return false; + // AV1 tracks require a CodecPrivate. See + // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md + // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to + // point to a stable version once it is finalized, or our own WebM mappings + // page on webmproject.org should we decide to release them. + if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_) + return false; + // |size| may be bigger than what is written out in this function because // derived classes may write out more data in the Track element. const uint64_t payload_size = PayloadSize(); @@ -909,11 +920,8 @@ void Track::set_codec_id(const char* codec_id) { const size_t length = strlen(codec_id) + 1; codec_id_ = new (std::nothrow) char[length]; // NOLINT if (codec_id_) { -#ifdef _MSC_VER - strcpy_s(codec_id_, length, codec_id); -#else - strcpy(codec_id_, codec_id); -#endif + memcpy(codec_id_, codec_id, length - 1); + codec_id_[length - 1] = '\0'; } } } @@ -926,11 +934,8 @@ void Track::set_language(const char* language) { const size_t length = strlen(language) + 1; language_ = new (std::nothrow) char[length]; // NOLINT if (language_) { -#ifdef _MSC_VER - strcpy_s(language_, length, language); -#else - strcpy(language_, language); -#endif + memcpy(language_, language, length - 1); + language_[length - 1] = '\0'; } } } @@ -942,11 +947,8 @@ void Track::set_name(const char* name) { const size_t length = strlen(name) + 1; name_ = new (std::nothrow) char[length]; // NOLINT if (name_) { -#ifdef _MSC_VER - strcpy_s(name_, length, name); -#else - strcpy(name_, name); -#endif + memcpy(name_, name, length - 1); + name_[length - 1] = '\0'; } } } @@ -1025,19 +1027,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } - if (r_ && - !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, - libwebm::kMkvPrimaryRChromaticityY)) { + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { return false; } - if (g_ && - !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, - libwebm::kMkvPrimaryGChromaticityY)) { + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY)) { return false; } - if (b_ && - !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, - libwebm::kMkvPrimaryBChromaticityY)) { + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { return false; } if (white_point_ && @@ -1052,22 +1051,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { bool MasteringMetadata::SetChromaticity( const PrimaryChromaticity* r, const PrimaryChromaticity* g, const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { - PrimaryChromaticityPtr r_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); if (r) { if (!CopyChromaticity(r, &r_ptr)) return false; } - PrimaryChromaticityPtr g_ptr(NULL); + PrimaryChromaticityPtr g_ptr(nullptr); if (g) { if (!CopyChromaticity(g, &g_ptr)) return false; } - PrimaryChromaticityPtr b_ptr(NULL); + PrimaryChromaticityPtr b_ptr(nullptr); if (b) { if (!CopyChromaticity(b, &b_ptr)) return false; } - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr wp_ptr(nullptr); if (white_point) { if (!CopyChromaticity(white_point, &wp_ptr)) return false; @@ -1233,7 +1232,7 @@ bool Colour::Write(IMkvWriter* writer) const { } bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { - std::auto_ptr mm_ptr(new MasteringMetadata()); + std::unique_ptr mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -1419,6 +1418,7 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), + colour_space_(NULL), colour_(NULL), projection_(NULL) {} @@ -1516,6 +1516,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { static_cast(alpha_mode_))) return false; } + if (colour_space_) { + if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_)) + return false; + } if (frame_rate_ > 0.0) { if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate, static_cast(frame_rate_))) { @@ -1540,8 +1544,21 @@ bool VideoTrack::Write(IMkvWriter* writer) const { return true; } +void VideoTrack::set_colour_space(const char* colour_space) { + if (colour_space) { + delete[] colour_space_; + + const size_t length = strlen(colour_space) + 1; + colour_space_ = new (std::nothrow) char[length]; // NOLINT + if (colour_space_) { + memcpy(colour_space_, colour_space, length - 1); + colour_space_[length - 1] = '\0'; + } + } +} + bool VideoTrack::SetColour(const Colour& colour) { - std::auto_ptr colour_ptr(new Colour()); + std::unique_ptr colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -1569,7 +1586,7 @@ bool VideoTrack::SetColour(const Colour& colour) { } bool VideoTrack::SetProjection(const Projection& projection) { - std::auto_ptr projection_ptr(new Projection()); + std::unique_ptr projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -1623,6 +1640,8 @@ uint64_t VideoTrack::VideoPayloadSize() const { if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast(frame_rate_)); + if (colour_space_) + size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_); if (colour_) size += colour_->ColourSize(); if (projection_) @@ -1700,9 +1719,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const { const char Tracks::kOpusCodecId[] = "A_OPUS"; const char Tracks::kVorbisCodecId[] = "A_VORBIS"; +const char Tracks::kAv1CodecId[] = "V_AV1"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; -const char Tracks::kVp10CodecId[] = "V_VP10"; const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; @@ -2661,7 +2680,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) { // and write it if it is okay to do so (i.e.) no other track has an held back // frame with timestamp <= the timestamp of the frame in question. std::vector::iterator> frames_to_erase; - for (std::list::iterator + for (std::list::iterator current_track_iterator = stored_frames_[track_number].begin(), end = --stored_frames_[track_number].end(); current_track_iterator != end; ++current_track_iterator) { @@ -2826,13 +2845,13 @@ bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) { uint32_t SeekHead::GetId(int index) const { if (index < 0 || index >= kSeekEntryCount) - return UINT_MAX; + return UINT32_MAX; return seek_entry_id_[index]; } uint64_t SeekHead::GetPosition(int index) const { if (index < 0 || index >= kSeekEntryCount) - return ULLONG_MAX; + return UINT64_MAX; return seek_entry_pos_[index]; } @@ -2866,7 +2885,7 @@ SegmentInfo::SegmentInfo() muxing_app_(NULL), timecode_scale_(1000000ULL), writing_app_(NULL), - date_utc_(LLONG_MIN), + date_utc_(INT64_MIN), duration_pos_(-1) {} SegmentInfo::~SegmentInfo() { @@ -2897,11 +2916,8 @@ bool SegmentInfo::Init() { if (!muxing_app_) return false; -#ifdef _MSC_VER - strcpy_s(muxing_app_, app_len, temp); -#else - strcpy(muxing_app_, temp); -#endif + memcpy(muxing_app_, temp, app_len - 1); + muxing_app_[app_len - 1] = '\0'; set_writing_app(temp); if (!writing_app_) @@ -2944,7 +2960,7 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast(duration_)); - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) size += EbmlDateElementSize(libwebm::kMkvDateUTC); size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_); size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_); @@ -2969,7 +2985,7 @@ bool SegmentInfo::Write(IMkvWriter* writer) { return false; } - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_); if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_)) @@ -2992,11 +3008,8 @@ void SegmentInfo::set_muxing_app(const char* app) { if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + temp_str[length - 1] = '\0'; delete[] muxing_app_; muxing_app_ = temp_str; @@ -3010,11 +3023,8 @@ void SegmentInfo::set_writing_app(const char* app) { if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + temp_str[length - 1] = '\0'; delete[] writing_app_; writing_app_ = temp_str; @@ -3053,7 +3063,8 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), - estimate_file_duration_(true), + estimate_file_duration_(false), + ebml_header_size_(0), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), @@ -3361,7 +3372,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { track->set_width(width); track->set_height(height); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } has_video_ = true; return track->number(); @@ -3383,8 +3397,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { cue->set_block_number(cluster->blocks_added()); cue->set_cluster_pos(cluster->position_for_cues()); cue->set_track(track); - if (!cues_.AddCue(cue)) + if (!cues_.AddCue(cue)) { + delete cue; return false; + } new_cuepoint_ = false; return true; @@ -3401,7 +3417,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, track->set_sample_rate(sample_rate); track->set_channels(channels); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } return track->number(); } @@ -3490,16 +3509,33 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (frame->discard_padding() != 0) doc_type_version_ = 4; + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + // If the segment has a video track hold onto audio frames to make sure the // audio that is associated with the start time of a video key-frame is // muxed into the same cluster. if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && !force_new_cluster_) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame || !new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; - if (!QueueFrame(new_frame)) + } + if (!QueueFrame(new_frame)) { + delete new_frame; return false; + } track_frames_written_[frame->track_number() - 1]++; return true; } @@ -3522,8 +3558,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (!frame->CanBeSimpleBlock() && !frame->is_key() && !frame->reference_block_timestamp_set()) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; + } new_frame->set_reference_block_timestamp( last_track_timestamp_[frame->track_number() - 1]); frame = new_frame; @@ -3570,19 +3608,17 @@ bool Segment::SetChunking(bool chunking, const char* filename) { if (chunking_ && !strcmp(filename, chunking_base_name_)) return true; - const size_t name_length = strlen(filename) + 1; - char* const temp = new (std::nothrow) char[name_length]; // NOLINT + const size_t filename_length = strlen(filename); + char* const temp = new (std::nothrow) char[filename_length + 1]; // NOLINT if (!temp) return false; -#ifdef _MSC_VER - strcpy_s(temp, name_length, filename); -#else - strcpy(temp, filename); -#endif + memcpy(temp, filename, filename_length); + temp[filename_length] = '\0'; delete[] chunking_base_name_; chunking_base_name_ = temp; + // From this point, strlen(chunking_base_name_) == filename_length if (!UpdateChunkName("chk", &chunk_name_)) return false; @@ -3608,18 +3644,16 @@ bool Segment::SetChunking(bool chunking, const char* filename) { if (!chunk_writer_cluster_->Open(chunk_name_)) return false; - const size_t header_length = strlen(filename) + strlen(".hdr") + 1; + const size_t hdr_length = strlen(".hdr"); + const size_t header_length = filename_length + hdr_length + 1; char* const header = new (std::nothrow) char[header_length]; // NOLINT if (!header) return false; -#ifdef _MSC_VER - strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_); - strcat_s(header, header_length, ".hdr"); -#else - strcpy(header, chunking_base_name_); - strcat(header, ".hdr"); -#endif + memcpy(header, chunking_base_name_, filename_length); + memcpy(&header[filename_length], ".hdr", hdr_length); + header[filename_length + hdr_length] = '\0'; + if (!chunk_writer_header_->Open(header)) { delete[] header; return false; @@ -3964,18 +3998,16 @@ bool Segment::UpdateChunkName(const char* ext, char** name) const { snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); #endif - const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1; + const size_t chunking_base_name_length = strlen(chunking_base_name_); + const size_t ext_chk_length = strlen(ext_chk); + const size_t length = chunking_base_name_length + ext_chk_length + 1; char* const str = new (std::nothrow) char[length]; // NOLINT if (!str) return false; -#ifdef _MSC_VER - strcpy_s(str, length - strlen(ext_chk), chunking_base_name_); - strcat_s(str, length, ext_chk); -#else - strcpy(str, chunking_base_name_); - strcat(str, ext_chk); -#endif + memcpy(str, chunking_base_name_, chunking_base_name_length); + memcpy(&str[chunking_base_name_length], ext_chk, ext_chk_length); + str[chunking_base_name_length + ext_chk_length] = '\0'; delete[] * name; *name = str; @@ -4048,12 +4080,16 @@ int Segment::WriteFramesAll() { // places where |doc_type_version_| needs to be updated. if (frame->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame)) - return -1; + if (!cluster->AddFrame(frame)) { + delete frame; + continue; + } if (new_cuepoint_ && cues_track_ == frame->track_number()) { - if (!AddCuePoint(frame->timestamp(), cues_track_)) - return -1; + if (!AddCuePoint(frame->timestamp(), cues_track_)) { + delete frame; + continue; + } } if (frame->timestamp() > last_timestamp_) { @@ -4096,12 +4132,16 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { const Frame* const frame_prev = frames_[i - 1]; if (frame_prev->discard_padding() != 0) doc_type_version_ = 4; - if (!cluster->AddFrame(frame_prev)) - return false; + if (!cluster->AddFrame(frame_prev)) { + delete frame_prev; + continue; + } if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) { - if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) - return false; + if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) { + delete frame_prev; + continue; + } } ++shift_left; @@ -4136,8 +4176,8 @@ bool Segment::DocTypeIsWebm() const { // TODO(vigneshv): Tweak .clang-format. const char* kWebmCodecIds[kNumCodecIds] = { Tracks::kOpusCodecId, Tracks::kVorbisCodecId, - Tracks::kVp8CodecId, Tracks::kVp9CodecId, - Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId}; diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 46b0029dc4..2c4bb9e93e 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -330,7 +330,7 @@ class ContentEncoding { private: // Returns the size in bytes for the encoding elements. - uint64_t EncodingSize(uint64_t compresion_size, + uint64_t EncodingSize(uint64_t compression_size, uint64_t encryption_size) const; // Returns the size in bytes for the encryption elements. @@ -795,6 +795,8 @@ class VideoTrack : public Track { uint64_t alpha_mode() { return alpha_mode_; } void set_width(uint64_t width) { width_ = width; } uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } Colour* colour() { return colour_; } @@ -824,6 +826,7 @@ class VideoTrack : public Track { uint64_t stereo_mode_; uint64_t alpha_mode_; uint64_t width_; + char* colour_space_; Colour* colour_; Projection* projection_; @@ -871,9 +874,9 @@ class Tracks { static const char kOpusCodecId[]; static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; static const char kVp8CodecId[]; static const char kVp9CodecId[]; - static const char kVp10CodecId[]; static const char kWebVttCaptionsId[]; static const char kWebVttDescriptionsId[]; static const char kWebVttMetadataId[]; @@ -1422,7 +1425,7 @@ class SeekHead { bool Write(IMkvWriter* writer); // We are going to put a cap on the number of Seek Entries. - const static int32_t kSeekEntryCount = 5; + constexpr static int32_t kSeekEntryCount = 5; private: // Returns the maximum size in bytes of one seek entry. @@ -1478,7 +1481,7 @@ class SegmentInfo { uint64_t timecode_scale_; // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision. char* writing_app_; - // LLONG_MIN when DateUTC is not set. + // INT64_MIN when DateUTC is not set. int64_t date_utc_; // The file position of the duration element. @@ -1502,8 +1505,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - static const uint32_t kDefaultDocTypeVersion = 4; - static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static constexpr uint32_t kDefaultDocTypeVersion = 4; + static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 1ba17ac1ba..d1e835cd00 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -10,6 +10,7 @@ #ifdef __ANDROID__ #include +#include #endif #include @@ -135,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, return false; } - if (!frame->is_key() && - !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, - reference_block_timestamp)) { + if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, + reference_block_timestamp)) { return false; } @@ -288,7 +288,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) { ebml_size += strlen(value); // Size of Datasize - ebml_size++; + ebml_size += GetCodedUIntSize(strlen(value)); return ebml_size; } @@ -508,7 +508,7 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) { if (WriteUInt(writer, length)) return false; - if (writer->Write(value, static_cast(length))) + if (writer->Write(value, static_cast(length))) return false; return true; @@ -562,10 +562,10 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; - return frame->CanBeSimpleBlock() ? - WriteSimpleBlock(writer, frame, relative_timecode) : - WriteBlock(writer, frame, relative_timecode, - cluster->timecode_scale()); + return frame->CanBeSimpleBlock() + ? WriteSimpleBlock(writer, frame, relative_timecode) + : WriteBlock(writer, frame, relative_timecode, + cluster->timecode_scale()); } uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { @@ -606,26 +606,22 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; - *minor = 2; - *build = 1; + *minor = 3; + *build = 3; *revision = 0; } uint64 MakeUID(unsigned int* seed) { uint64 uid = 0; -#ifdef __MINGW32__ - srand(*seed); -#endif - for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values uid <<= 8; // TODO(fgalligan): Move random number generation to platform specific code. -#ifdef _MSC_VER +#ifdef _WIN32 (void)seed; const int32 nn = rand(); -#elif __ANDROID__ +#elif defined(__ANDROID__) (void)seed; int32 temp_num = 1; int fd = open("/dev/urandom", O_RDONLY); @@ -634,8 +630,6 @@ uint64 MakeUID(unsigned int* seed) { close(fd); } const int32 nn = temp_num; -#elif defined __MINGW32__ - const int32 nn = rand(); #else const int32 nn = rand_r(seed); #endif diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 132388da59..85fc2a209e 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -8,9 +8,9 @@ #ifndef MKVMUXER_MKVMUXERUTIL_H_ #define MKVMUXER_MKVMUXERUTIL_H_ -#include "mkvmuxertypes.h" +#include -#include "stdint.h" +#include "mkvmuxertypes.h" namespace mkvmuxer { class Cluster; @@ -31,6 +31,9 @@ const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); +// Writes out |f| in Big Endian order. Returns 0 on success. +int32 SerializeFloat(IMkvWriter* writer, float f); + // Returns the size in bytes of the element. int32 GetUIntSize(uint64 value); int32 GetIntSize(int64 value); diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ec34e4df81..9b714a5e7c 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/media/libvpx/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvwriter.h" +#include + #ifdef _MSC_VER #include // for _SH_DENYWR #endif @@ -76,8 +78,16 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); -#else +#elif defined(_WIN32) + return fseeko64(file_, static_cast(position), SEEK_SET); +#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) + // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + // Android API level 24. See + // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md return fseeko(file_, static_cast(position), SEEK_SET); +#else + return fseek(file_, static_cast(position), SEEK_SET); #endif } diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index e62d6f6075..4fa7b37887 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -8,7 +8,6 @@ #include "mkvparser/mkvparser.h" #if defined(_MSC_VER) && _MSC_VER < 1800 -#include // _isnan() / _finite() #define MSC_COMPAT #endif @@ -16,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +23,7 @@ #include "common/webmids.h" namespace mkvparser { +const long long kStringElementSizeLimit = 20 * 1000 * 1000; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; const float Projection::kValueNotPresent = FLT_MAX; @@ -35,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); } inline bool isinf(double val) { return std::isinf(val); } #endif // MSC_COMPAT -IMkvReader::~IMkvReader() {} - template Type* SafeArrayAlloc(unsigned long long num_elements, unsigned long long element_size) { @@ -55,9 +54,9 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; - minor = 0; - build = 0; - revision = 30; + minor = 1; + build = 3; + revision = 0; } long long ReadUInt(IMkvReader* pReader, long long pos, long& len) { @@ -247,7 +246,8 @@ long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_, if (size == 4) { union { float f; - unsigned long ff; + uint32_t ff; + static_assert(sizeof(float) == sizeof(uint32_t), ""); }; ff = 0; @@ -265,7 +265,8 @@ long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_, } else { union { double d; - unsigned long long dd; + uint64_t dd; + static_assert(sizeof(double) == sizeof(uint64_t), ""); }; dd = 0; @@ -299,7 +300,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size, if (status < 0) return status; - unsigned long long result = first_byte; + unsigned long long result = static_cast(first_byte); ++pos; for (long i = 1; i < size; ++i) { @@ -325,7 +326,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size, delete[] str; str = NULL; - if (size >= LONG_MAX || size < 0) + if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit) return E_FILE_FORMAT_INVALID; // +1 for '\0' terminator @@ -1503,8 +1504,8 @@ long SeekHead::Parse() { // first count the seek head entries - int entry_count = 0; - int void_element_count = 0; + long long entry_count = 0; + long long void_element_count = 0; while (pos < stop) { long long id, size; @@ -1514,10 +1515,15 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) + if (id == libwebm::kMkvSeek) { ++entry_count; - else if (id == libwebm::kMkvVoid) + if (entry_count > INT_MAX) + return E_PARSE_FAILED; + } else if (id == libwebm::kMkvVoid) { ++void_element_count; + if (void_element_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload @@ -1528,15 +1534,20 @@ long SeekHead::Parse() { if (pos != stop) return E_FILE_FORMAT_INVALID; - m_entries = new (std::nothrow) Entry[entry_count]; + if (entry_count > 0) { + m_entries = new (std::nothrow) Entry[static_cast(entry_count)]; - if (m_entries == NULL) - return -1; + if (m_entries == NULL) + return -1; + } - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + if (void_element_count > 0) { + m_void_elements = + new (std::nothrow) VoidElement[static_cast(void_element_count)]; - if (m_void_elements == NULL) - return -1; + if (m_void_elements == NULL) + return -1; + } // now parse the entries and void elements @@ -1555,14 +1566,14 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) { + if (id == libwebm::kMkvSeek && entry_count > 0) { if (ParseEntry(pReader, pos, size, pEntry)) { Entry& e = *pEntry++; e.element_start = idpos; e.element_size = (pos + size) - idpos; } - } else if (id == libwebm::kMkvVoid) { + } else if (id == libwebm::kMkvVoid && void_element_count > 0) { VoidElement& e = *pVoidElement++; e.element_start = idpos; @@ -1579,13 +1590,13 @@ long SeekHead::Parse() { ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries); assert(count_ >= 0); - assert(count_ <= entry_count); + assert(static_cast(count_) <= entry_count); m_entry_count = static_cast(count_); count_ = ptrdiff_t(pVoidElement - m_void_elements); assert(count_ >= 0); - assert(count_ <= void_element_count); + assert(static_cast(count_) <= void_element_count); m_void_element_count = static_cast(count_); @@ -2296,7 +2307,7 @@ bool CuePoint::Load(IMkvReader* pReader) { long long pos = pos_; // First count number of track positions - + unsigned long long track_positions_count = 0; while (pos < stop) { long len; @@ -2320,12 +2331,17 @@ bool CuePoint::Load(IMkvReader* pReader) { if (id == libwebm::kMkvCueTime) m_timecode = UnserializeUInt(pReader, pos, size); - else if (id == libwebm::kMkvCueTrackPositions) - ++m_track_positions_count; + else if (id == libwebm::kMkvCueTrackPositions) { + ++track_positions_count; + if (track_positions_count > UINT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload } + m_track_positions_count = static_cast(track_positions_count); + if (m_timecode < 0 || m_track_positions_count <= 0) { return false; } @@ -2418,7 +2434,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, pos += size; // consume payload } - if ((m_pos < 0) || (m_track <= 0)) { + if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) { return false; } @@ -2426,7 +2442,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, } const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { - assert(pTrack); + if (pTrack == NULL) { + return NULL; + } const long long n = pTrack->GetNumber(); @@ -4026,7 +4044,7 @@ long SegmentInfo::Parse() { } const double rollover_check = m_duration * m_timecodeScale; - if (rollover_check > LLONG_MAX) + if (rollover_check > static_cast(LLONG_MAX)) return E_FILE_FORMAT_INVALID; if (pos != stop) @@ -4189,8 +4207,8 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, const long long stop = start + size; // Count ContentCompression and ContentEncryption elements. - int compression_count = 0; - int encryption_count = 0; + long long compression_count = 0; + long long encryption_count = 0; while (pos < stop) { long long id, size; @@ -4198,11 +4216,17 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, if (status < 0) // error return status; - if (id == libwebm::kMkvContentCompression) + if (id == libwebm::kMkvContentCompression) { ++compression_count; + if (compression_count > INT_MAX) + return E_PARSE_FAILED; + } - if (id == libwebm::kMkvContentEncryption) + if (id == libwebm::kMkvContentEncryption) { ++encryption_count; + if (encryption_count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4213,18 +4237,19 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, return -1; if (compression_count > 0) { - compression_entries_ = - new (std::nothrow) ContentCompression*[compression_count]; + compression_entries_ = new (std::nothrow) + ContentCompression*[static_cast(compression_count)]; if (!compression_entries_) return -1; compression_entries_end_ = compression_entries_; } if (encryption_count > 0) { - encryption_entries_ = - new (std::nothrow) ContentEncryption*[encryption_count]; + encryption_entries_ = new (std::nothrow) + ContentEncryption*[static_cast(encryption_count)]; if (!encryption_entries_) { delete[] compression_entries_; + compression_entries_ = NULL; return -1; } encryption_entries_end_ = encryption_entries_; @@ -4256,6 +4281,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete compression; return status; } + assert(compression_count > 0); *compression_entries_end_++ = compression; } else if (id == libwebm::kMkvContentEncryption) { ContentEncryption* const encryption = @@ -4268,6 +4294,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete encryption; return status; } + assert(encryption_count > 0); *encryption_entries_end_++ = encryption; } @@ -4320,6 +4347,12 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size, return status; } + // There should be only one settings element per content compression. + if (compression->settings != NULL) { + delete[] buf; + return E_FILE_FORMAT_INVALID; + } + compression->settings = buf; compression->settings_len = buflen; } @@ -4538,7 +4571,8 @@ int Track::Info::CopyStr(char* Info::*str, Info& dst_) const { if (dst == NULL) return -1; - strcpy(dst, src); + memcpy(dst, src, len); + dst[len] = '\0'; return 0; } @@ -4904,7 +4938,7 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { const long long stop = start + size; // Count ContentEncoding elements. - int count = 0; + long long count = 0; while (pos < stop) { long long id, size; const long status = ParseElementHeader(pReader, pos, stop, id, size); @@ -4912,8 +4946,11 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { return status; // pos now designates start of element - if (id == libwebm::kMkvContentEncoding) + if (id == libwebm::kMkvContentEncoding) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -4923,7 +4960,8 @@ long Track::ParseContentEncodingsEntry(long long start, long long size) { if (count <= 0) return -1; - content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + content_encoding_entries_ = + new (std::nothrow) ContentEncoding*[static_cast(count)]; if (!content_encoding_entries_) return -1; @@ -4975,29 +5013,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, if (!reader) return false; - std::auto_ptr chromaticity_ptr; + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); - if (!*chromaticity) { - chromaticity_ptr.reset(new PrimaryChromaticity()); - } else { - chromaticity_ptr.reset(*chromaticity); - } - - if (!chromaticity_ptr.get()) + if (!*chromaticity) return false; - float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y; + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; double parser_value = 0; - const long long value_parse_status = + const long long parse_status = UnserializeFloat(reader, read_pos, value_size, parser_value); + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. + if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 || + (parser_value > 0.0 && parser_value < FLT_MIN)) + return false; + *value = static_cast(parser_value); - if (value_parse_status < 0 || *value < 0.0 || *value > 1.0) - return false; - - *chromaticity = chromaticity_ptr.release(); return true; } @@ -5006,7 +5042,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, if (!reader || *mm) return false; - std::auto_ptr mm_ptr(new MasteringMetadata()); + std::unique_ptr mm_ptr(new MasteringMetadata()); if (!mm_ptr.get()) return false; @@ -5026,6 +5062,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_max = static_cast(value); if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 || mm_ptr->luminance_max > 9999.99) { @@ -5035,6 +5075,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); + if (value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { + return false; + } mm_ptr->luminance_min = static_cast(value); if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 || mm_ptr->luminance_min > 999.9999) { @@ -5087,7 +5131,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, if (!reader || *colour) return false; - std::auto_ptr colour_ptr(new Colour()); + std::unique_ptr colour_ptr(new Colour()); if (!colour_ptr.get()) return false; @@ -5185,7 +5229,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, if (!reader || *projection) return false; - std::auto_ptr projection_ptr(new Projection()); + std::unique_ptr projection_ptr(new Projection()); if (!projection_ptr.get()) return false; @@ -5209,6 +5253,8 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, projection_ptr->type = static_cast(projection_type); } else if (child_id == libwebm::kMkvProjectionPrivate) { + if (projection_ptr->private_data != NULL) + return false; unsigned char* data = SafeArrayAlloc(1, child_size); if (data == NULL) @@ -5228,7 +5274,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); - if (value_parse_status < 0) { + // Make sure value is representable as a float before casting. + if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { return false; } @@ -5259,10 +5307,12 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) : Track(pSegment, element_start, element_size), + m_colour_space(NULL), m_colour(NULL), m_projection(NULL) {} VideoTrack::~VideoTrack() { + delete[] m_colour_space; delete m_colour; delete m_projection; } @@ -5284,6 +5334,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, long long stereo_mode = 0; double rate = 0.0; + std::unique_ptr colour_space_ptr; IMkvReader* const pReader = pSegment->m_pReader; @@ -5296,8 +5347,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; - Colour* colour = NULL; - Projection* projection = NULL; + std::unique_ptr colour_ptr; + std::unique_ptr projection_ptr; while (pos < stop) { long long id, size; @@ -5346,11 +5397,25 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, if (rate <= 0) return E_FILE_FORMAT_INVALID; } else if (id == libwebm::kMkvColour) { - if (!Colour::Parse(pReader, pos, size, &colour)) + Colour* colour = NULL; + if (!Colour::Parse(pReader, pos, size, &colour)) { return E_FILE_FORMAT_INVALID; + } else { + colour_ptr.reset(colour); + } } else if (id == libwebm::kMkvProjection) { - if (!Projection::Parse(pReader, pos, size, &projection)) + Projection* projection = NULL; + if (!Projection::Parse(pReader, pos, size, &projection)) { return E_FILE_FORMAT_INVALID; + } else { + projection_ptr.reset(projection); + } + } else if (id == libwebm::kMkvColourSpace) { + char* colour_space = NULL; + const long status = UnserializeString(pReader, pos, size, colour_space); + if (status < 0) + return status; + colour_space_ptr.reset(colour_space); } pos += size; // consume payload @@ -5381,8 +5446,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_display_unit = display_unit; pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; - pTrack->m_colour = colour; - pTrack->m_projection = projection; + pTrack->m_colour = colour_ptr.release(); + pTrack->m_colour_space = colour_space_ptr.release(); + pTrack->m_projection = projection_ptr.release(); pResult = pTrack; return 0; // success @@ -5611,7 +5677,7 @@ long Tracks::Parse() { const long long stop = m_start + m_size; IMkvReader* const pReader = m_pSegment->m_pReader; - int count = 0; + long long count = 0; long long pos = m_start; while (pos < stop) { @@ -5625,8 +5691,11 @@ long Tracks::Parse() { if (size == 0) // weird continue; - if (id == libwebm::kMkvTrackEntry) + if (id == libwebm::kMkvTrackEntry) { ++count; + if (count > INT_MAX) + return E_PARSE_FAILED; + } pos += size; // consume payload if (pos > stop) @@ -5639,7 +5708,7 @@ long Tracks::Parse() { if (count <= 0) return 0; // success - m_trackEntries = new (std::nothrow) Track*[count]; + m_trackEntries = new (std::nothrow) Track*[static_cast(count)]; if (m_trackEntries == NULL) return -1; @@ -7821,8 +7890,10 @@ long Block::Parse(const Cluster* pCluster) { if (frame_size <= 0) return E_FILE_FORMAT_INVALID; +#if LLONG_MAX > LONG_MAX if (frame_size > LONG_MAX) return E_FILE_FORMAT_INVALID; +#endif if ((pos + len) > stop) return E_FILE_FORMAT_INVALID; @@ -7888,10 +7959,16 @@ long Block::Parse(const Cluster* pCluster) { if (frame_size <= 0) return E_FILE_FORMAT_INVALID; +#if LLONG_MAX > LONG_MAX if (frame_size > LONG_MAX) return E_FILE_FORMAT_INVALID; +#endif curr.len = static_cast(frame_size); + // Check if size + curr.len could overflow. + if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } size += curr.len; // contribution of this frame --frame_count; @@ -7932,7 +8009,6 @@ long Block::Parse(const Cluster* pCluster) { pf = m_frames; while (pf != pf_end) { Frame& f = *pf++; - assert((pos + f.len) <= stop); if ((pos + f.len) > stop) return E_FILE_FORMAT_INVALID; @@ -7954,6 +8030,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const { const long long tc0 = pCluster->GetTimeCode(); assert(tc0 >= 0); + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + const long long tc = tc0 + m_timecode; return tc; // unscaled timecode units @@ -7971,6 +8052,10 @@ long long Block::GetTime(const Cluster* pCluster) const { const long long scale = pInfo->GetTimeCodeScale(); assert(scale >= 1); + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } const long long ns = tc * scale; return ns; diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 26c2b7e5eb..848d01f03e 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -22,7 +22,7 @@ class IMkvReader { virtual int Length(long long* total, long long* available) = 0; protected: - virtual ~IMkvReader(); + virtual ~IMkvReader() {} }; template @@ -527,6 +527,8 @@ class VideoTrack : public Track { Projection* GetProjection() const; + const char* GetColourSpace() const { return m_colour_space; } + private: long long m_width; long long m_height; @@ -534,7 +536,7 @@ class VideoTrack : public Track { long long m_display_height; long long m_display_unit; long long m_stereo_mode; - + char* m_colour_space; double m_rate; Colour* m_colour; diff --git a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index b8fd00c263..467260402a 100644 --- a/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/media/libvpx/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -7,6 +7,8 @@ // be found in the AUTHORS file in the root of the source tree. #include "mkvparser/mkvreader.h" +#include + #include namespace mkvparser { @@ -116,8 +118,16 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error -#else +#elif defined(_WIN32) + fseeko64(m_file, static_cast(offset), SEEK_SET); +#elif !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) + // POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + // Android API level 24. See + // https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md fseeko(m_file, static_cast(offset), SEEK_SET); +#else + fseek(m_file, static_cast(offset), SEEK_SET); #endif const size_t size = fread(buffer, 1, len, m_file); diff --git a/media/libvpx/libvpx/third_party/libyuv/LICENSE b/media/libvpx/libvpx/third_party/libyuv/LICENSE new file mode 100644 index 0000000000..c911747a6b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/media/libvpx/libvpx/third_party/libyuv/README.libvpx b/media/libvpx/libvpx/third_party/libyuv/README.libvpx index 485f79c0ff..9519dc4bee 100644 --- a/media/libvpx/libvpx/third_party/libyuv/README.libvpx +++ b/media/libvpx/libvpx/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv -Version: de944ed8c74909ea6fbd743a22efe1e55e851b83 +Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1 License: BSD License File: LICENSE @@ -8,15 +8,16 @@ Description: libyuv is an open source project that includes YUV conversion and scaling functionality. -The optimized scaler in libyuv is used in multiple resolution encoder example, -which down-samples the original input video (f.g. 1280x720) a number of times -in order to encode multiple resolution bit streams. +The optimized scaler in libyuv is used in the multiple resolution encoder +example which down-samples the original input video (f.g. 1280x720) a number of +times in order to encode multiple resolution bit streams. Local Modifications: -rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \ - LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \ - all.gyp build_overrides/ chromium/ codereview.settings docs/ \ - download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \ - include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \ - libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \ - third_party/ tools/ unit_test/ util/ winarm.mk +Disable ARGBToRGB24Row_AVX512VBMI due to build failure on Mac. +rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h +mv libyuv/include tmp/ +mv libyuv/source tmp/ +mv libyuv/LICENSE tmp/ +rm -rf libyuv + +mv tmp/* third_party/libyuv/ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h index 54a2181430..01d9dfc773 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/basic_types.h @@ -8,82 +8,36 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ #define INCLUDE_LIBYUV_BASIC_TYPES_H_ -#include // for NULL, size_t +#include // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED #if defined(_MSC_VER) && (_MSC_VER < 1600) #include // for uintptr_t on x86 +typedef unsigned __int64 uint64_t; +typedef __int64 int64_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef unsigned short uint16_t; +typedef short int16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; #else -#include // for uintptr_t -#endif - -#ifndef GG_LONGLONG -#ifndef INT_TYPES_DEFINED -#define INT_TYPES_DEFINED -#ifdef COMPILER_MSVC -typedef unsigned __int64 uint64; -typedef __int64 int64; -#ifndef INT64_C -#define INT64_C(x) x ## I64 -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UI64 -#endif -#define INT64_F "I64" -#else // COMPILER_MSVC -#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long uint64; // NOLINT -typedef long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## L -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UL -#endif -#define INT64_F "l" -#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long long uint64; // NOLINT -typedef long long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## LL -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## ULL -#endif -#define INT64_F "ll" -#endif // __LP64__ -#endif // COMPILER_MSVC -typedef unsigned int uint32; -typedef int int32; -typedef unsigned short uint16; // NOLINT -typedef short int16; // NOLINT -typedef unsigned char uint8; -typedef signed char int8; +#include // for uintptr_t and C99 types +#endif // defined(_MSC_VER) && (_MSC_VER < 1600) +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint8_t uint8; +typedef int8_t int8; #endif // INT_TYPES_DEFINED -#endif // GG_LONGLONG - -// Detect compiler is for x86 or x64. -#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) -#define CPU_X86 1 -#endif -// Detect compiler is for ARM. -#if defined(__arm__) || defined(_M_ARM) -#define CPU_ARM 1 -#endif - -#ifndef ALIGNP -#ifdef __cplusplus -#define ALIGNP(p, t) \ - (reinterpret_cast(((reinterpret_cast(p) + \ - ((t) - 1)) & ~((t) - 1)))) -#else -#define ALIGNP(p, t) \ - ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ -#endif -#endif #if !defined(LIBYUV_API) #if defined(_WIN32) || defined(__CYGWIN__) @@ -95,24 +49,17 @@ typedef signed char int8; #define LIBYUV_API #endif // LIBYUV_BUILDING_SHARED_LIBRARY #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__ ((visibility ("default"))) + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__((visibility("default"))) #else #define LIBYUV_API #endif // __GNUC__ #endif // LIBYUV_API +// TODO(fbarchard): Remove bool macros. #define LIBYUV_BOOL int #define LIBYUV_FALSE 0 #define LIBYUV_TRUE 1 -// Visual C x86 or GCC little endian. -#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) || \ - defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define LIBYUV_LITTLE_ENDIAN -#endif - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h index 08b2bb2ecf..3353ad71c6 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/compare.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_COMPARE_H_ #define INCLUDE_LIBYUV_COMPARE_H_ #include "libyuv/basic_types.h" @@ -20,59 +20,92 @@ extern "C" { // Compute a hash for specified memory. Seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); + +// Hamming Distance +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count); // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height); // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, int count); +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count); LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); static const int kMaxPsnr = 128; LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count); +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT +#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h index fcfcf544e1..d12ef24f79 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" @@ -16,8 +16,8 @@ #include "libyuv/rotate.h" // For enum RotationMode. // TODO(fbarchard): fix WebRTC source to include following libyuv headers: -#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 -#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 +#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 +#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 #include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 #ifdef __cplusplus @@ -27,195 +27,335 @@ extern "C" { // Convert I444 to I420. LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I422 to I420. LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert I411 to I420. -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy I420 to I420. #define I420ToI420 I420Copy LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I010 to I010 +#define I010ToI010 I010Copy +#define H010ToH010 I010Copy +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert 10 bit YUV to 8 bit +#define H010ToH420 I010ToI420 +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I400 (grey) to I420. LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #define J400ToJ420 I400ToI420 // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert NV21 to I420. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ARGB little endian (bgra in memory) to I420. LIBYUV_API -int ARGBToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // BGRA little endian (argb in memory) to I420. LIBYUV_API -int BGRAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ABGR little endian (rgba in memory) to I420. LIBYUV_API -int ABGRToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGBA little endian (abgr in memory) to I420. LIBYUV_API -int RGBAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB little endian (bgr in memory) to I420. LIBYUV_API -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB big endian (rgb in memory) to I420. LIBYUV_API -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB16 (RGBP fourcc) little endian to I420. LIBYUV_API -int RGB565ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB15 (RGBO fourcc) little endian to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB12 (R444 fourcc) little endian to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToI420(const uint8* sample, size_t sample_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height); // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height); +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height); #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. @@ -238,22 +378,29 @@ int MJPGSize(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API -int ConvertToI420(const uint8* src_frame, size_t src_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h index 19672f3269..ab772b6c32 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" @@ -30,258 +30,621 @@ extern "C" { // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Duplicate prototype for function in convert_from.h for remoting. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I420 with Alpha to preattenuated ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate); // Convert I420 with Alpha to preattenuated ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J400 (jpeg grey) to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Alias. #define YToARGB I400ToARGB // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to ABGR. +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert UYVY to ARGB. LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); // BGRA little endian (argb in memory) to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // ABGR little endian (rgba in memory) to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGBA little endian (abgr in memory) to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Deprecated function name. #define BG24ToARGB RGB24ToARGB // RGB little endian (bgr in memory) to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB big endian (rgb in memory) to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB16 (RGBP fourcc) little endian to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB15 (RGBO fourcc) little endian to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB12 (R444 fourcc) little endian to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Aliases +#define AB30ToARGB AR30ToABGR +#define AB30ToABGR AR30ToARGB +#define AB30ToAR30 AR30ToAB30 + +// Convert AR30 To ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR30 To ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert AR30 To AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height); #endif +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + // Convert camera sample to ARGB with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. +// "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. // Normally this would be the same as dst_width, with recommended alignment // to 16 bytes for better efficiency. @@ -300,20 +663,25 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API -int ConvertToARGB(const uint8* src_frame, size_t src_size, - uint8* dst_argb, int dst_stride_argb, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h index 39e1578a0e..5cd8a4bfc0 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_H_ #include "libyuv/basic_types.h" @@ -21,159 +21,322 @@ extern "C" { // See Also convert.h for conversions from formats to I420. -// I420Copy in convert to I420ToI420. +// Convert 8 bit YUV to 10 bit. +#define H420ToH010 I420ToI010 +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); + +LIBYUV_API +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - const uint8* dither4x4, int width, int height); +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 format); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h index 1df53200dd..05c815a093 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,170 +21,267 @@ extern "C" { // Copy ARGB to ARGB. #define ARGBToARGB ARGBCopy LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert ARGB To BGRA. LIBYUV_API -int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int ARGBToBGRA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert ARGB To ABGR. LIBYUV_API -int ARGBToABGR(const uint8* src_argb, int src_stride_argb, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int ARGBToABGR(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert ARGB To RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Aliases +#define ARGBToAB30 ABGRToAR30 +#define ABGRToAB30 ARGBToAR30 + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height); +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); // Convert ARGB To RGB565. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. // TODO(fbarchard): Consider pointer to 2d array for dither4x4. -// const uint8(*dither)[4][4]; +// const uint8_t(*dither)[4][4]; LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height); +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height); +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height); +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); // Convert ARGB To I444. LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I422. LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I420. (also in convert.h) LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J422. LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert ARGB To I411. -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J400. (JPeg full range). LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height); +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) LIBYUV_API -int ARGBToG(const uint8* src_argb, int src_stride_argb, - uint8* dst_g, int dst_stride_g, - int width, int height); +int ARGBToG(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_g, + int dst_stride_g, + int width, + int height); // Convert ARGB To NV12. LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height); +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); // Convert ARGB To UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height); +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h index dfb7445e2f..0229cb5e73 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/cpu_id.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_ #include "libyuv/basic_types.h" @@ -31,50 +31,89 @@ static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasSSE42 = 0x100; // unused at this time. static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasAVX3 = 0x2000; -// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. +static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x10000; -static const int kCpuHasDSPR2 = 0x20000; +static const int kCpuHasMIPS = 0x200000; +static const int kCpuHasMSA = 0x400000; -// Internal function used to auto-init. +// Optional init function. TestCpuFlag does an auto-init. +// Returns cpu_info flags. LIBYUV_API int InitCpuFlags(void); +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// Returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); +#else + int cpu_info = cpu_info_; +#endif + return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; +} + // Internal function for parsing /proc/cpuinfo. LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); -// Detect CPU has SSE2 etc. -// Test_flag parameter should be one of kCpuHas constants above. -// returns non-zero if instruction set is detected -static __inline int TestCpuFlag(int test_flag) { - LIBYUV_API extern int cpu_info_; - return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag; -} - // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // MaskCpuFlags(-1) to enable all cpu specific optimizations. // MaskCpuFlags(1) to disable all cpu specific optimizations. +// MaskCpuFlags(0) to reset state so next call will auto init. +// Returns cpu_info flags. LIBYUV_API -void MaskCpuFlags(int enable_flags); +int MaskCpuFlags(int enable_flags); + +// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| +// should be a valid combination of the kCpuHas constants above and include +// kCpuInitialized. Use this method when running in a sandboxed process where +// the detection code might fail (as it might access /proc/cpuinfo). In such +// cases the cpu_info can be obtained from a non sandboxed process by calling +// InitCpuFlags() and passed to the sandboxed process (via command line +// parameters, IPC...) which can then call this method to initialize the CPU +// flags. +// Notes: +// - when specifying 0 for |cpu_flags|, the auto initialization is enabled +// again. +// - enabling CPU features that are not supported by the CPU will result in +// undefined behavior. +// TODO(fbarchard): consider writing a helper function that translates from +// other library CPU info to libyuv CPU info and add a .md doc that explains +// CPU detection. +static __inline void SetCpuFlags(int cpu_flags) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_flags; +#endif +} // Low level cpuid for X86. Returns zeros on other CPUs. // eax is the info type that you want. // ecx is typically the cpu number, and should normally be zero. LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); +void CpuId(int info_eax, int info_ecx, int* cpu_info); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h new file mode 100644 index 0000000000..bba0e8aeda --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/macros_msa.h @@ -0,0 +1,233 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ +#define INCLUDE_LIBYUV_MACROS_MSA_H_ + +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include +#include + +#if (__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#if (__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint64_t val_m = (val); \ + asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ + : [pdst_sd_m] "=m"(*pdst_sd_m) \ + : [val_m] "r"(val_m)); \ + }) +#else // !(__mips == 64) +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // !(__mips == 64) +#else // !(__mips_isa_rev >= 6) +#define LW(psrc) \ + ({ \ + const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ + uint32_t val_m; \ + asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint64_t val_m = 0; \ + asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ + : [val_m] "=r"(val_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // (__mips_isa_rev >= 6) + +// TODO(fbarchard): Consider removing __VAR_ARGS versions. +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) + +/* Description : Load two vectors with 16 'byte' sized elements + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) + +/* Description : Store two vectors with stride each having 16 'byte' sized + elements + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) + +// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) + +#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ + +#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h index 8423121d11..275f8d4c18 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ #define INCLUDE_LIBYUV_MJPEG_DECODER_H_ #include "libyuv/basic_types.h" @@ -26,25 +26,24 @@ namespace libyuv { extern "C" { #endif -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); #ifdef __cplusplus } // extern "C" #endif -static const uint32 kUnknownDataSize = 0xFFFFFFFF; +static const uint32_t kUnknownDataSize = 0xFFFFFFFF; enum JpegSubsamplingType { kJpegYuv420, kJpegYuv422, - kJpegYuv411, kJpegYuv444, kJpegYuv400, kJpegUnknown }; struct Buffer { - const uint8* data; + const uint8_t* data; int len; }; @@ -66,7 +65,7 @@ struct SetJmpErrorMgr; class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows); @@ -86,7 +85,7 @@ class LIBYUV_API MJpegDecoder { // If return value is LIBYUV_TRUE, then the values for all the following // getters are populated. // src_len is the size of the compressed mjpeg frame in bytes. - LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); // Returns width of the last loaded frame in pixels. int GetWidth(); @@ -139,18 +138,22 @@ class LIBYUV_API MJpegDecoder { // at least GetComponentSize(i). The pointers in planes are incremented // to point to after the end of the written data. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); // Decodes the entire image and passes the data via repeated calls to a // callback function. Each call will get the data for a whole number of // image scanlines. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height); + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height); // The helper function which recognizes the jpeg sub-sampling type. static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components); + int* subsample_x, + int* subsample_y, + int number_of_components); private: void AllocOutputBuffers(int num_outbufs); @@ -159,7 +162,7 @@ class LIBYUV_API MJpegDecoder { LIBYUV_BOOL StartDecode(); LIBYUV_BOOL FinishDecode(); - void SetScanlinePointers(uint8** data); + void SetScanlinePointers(uint8_t** data); LIBYUV_BOOL DecodeImcuRow(); int GetComponentScanlinePadding(int component); @@ -178,15 +181,15 @@ class LIBYUV_API MJpegDecoder { // Temporaries used to point to scanline outputs. int num_outbufs_; // Outermost size of all arrays below. - uint8*** scanlines_; + uint8_t*** scanlines_; int* scanlines_sizes_; // Temporary buffer used for decoding when we can't decode directly to the // output buffers. Large enough for just one iMCU row. - uint8** databuf_; + uint8_t** databuf_; int* databuf_strides_; }; } // namespace libyuv #endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h index 9662516c57..91137baba2 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/planar_functions.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" @@ -22,449 +22,10 @@ namespace libyuv { extern "C" { #endif -// Copy a plane of data. -LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height); - -// Set a plane of data to a 32 bit value. -LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value); - -// Split interleaved UV plane into separate U and V planes. -LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Merge separate U and V planes into one interleaved UV plane. -LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -// Copy I400. Supports inverting. -LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -#define J400ToJ400 I400ToI400 - -// Copy I422 to I422. -#define I422ToI422 I422Copy -LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Copy I444 to I444. -#define I444ToI444 I444Copy -LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert YUY2 to I422. -LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert UYVY to I422. -LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -// Convert I420 to I400. (calls CopyPlane ignoring u/v). -LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alias -#define J420ToJ400 I420ToI400 -#define I420ToI420Mirror I420Mirror - -// I420 mirror. -LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Alias -#define I400ToI400Mirror I400Mirror - -// I400 mirror. A single plane is mirrored horizontally. -// Pass negative height to achieve 180 degree rotation. -LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alias -#define ARGBToARGBMirror ARGBMirror - -// ARGB mirror. -LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert NV12 to RGB565. -LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); - -// I422ToARGB is in convert_argb.h -// Convert I422 to BGRA. -LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); - -// Convert I422 to ABGR. -LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I422 to RGBA. -LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); - -// Alias -#define RGB24ToRAW RAWToRGB24 - -LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); - -// Draw a rectangle into I420. -LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, int width, int height, - int value_y, int value_u, int value_v); - -// Draw a rectangle into ARGB. -LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height, uint32 value); - -// Convert ARGB to gray scale ARGB. -LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Make a rectangle of ARGB gray scale. -LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); - -// Make a rectangle of ARGB Sepia tone. -LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); - -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The next 4 coefficients apply to B, G, R, A and produce R of the output. -// The last 4 coefficients apply to B, G, R, A and produce A of the output. -LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height); - -// Deprecated. Use ARGBColorMatrix instead. -// Apply a matrix rotation to each ARGB pixel. -// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. -// The first 4 coefficients apply to B, G, R, A and produce B of the output. -// The next 4 coefficients apply to B, G, R, A and produce G of the output. -// The last 4 coefficients apply to B, G, R, A and produce R of the output. -LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int x, int y, int width, int height); - -// Apply a color table each ARGB pixel. -// Table contains 256 ARGB values. -LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); - -// Apply a color table each ARGB pixel but preserve destination alpha. -// Table contains 256 ARGB values. -LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); - -// Apply a luma/color table each ARGB pixel but preserve destination alpha. -// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from -// RGB (YJ style) and C is an 8 bit color component (R, G or B). -LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma_rgb_table, - int width, int height); - -// Apply a 3 term polynomial to ARGB values. -// poly points to a 4x4 matrix. The first row is constants. The 2nd row is -// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, -// g squared, r squared and a squared. The 4rd row is coefficients for b to -// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and -// result clamped to 0 to 255. -// A polynomial approximation can be dirived using software such as 'R'. - -LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const float* poly, - int width, int height); - -// Quantize a rectangle of ARGB. Alpha unaffected. -// scale is a 16 bit fractional fixed point scaler between 0 and 65535. -// interval_size should be a value between 1 and 255. -// interval_offset should be a value between 0 and 255. -LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int x, int y, int width, int height); - -// Copy ARGB to ARGB. -LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Copy Alpha channel of ARGB to alpha of ARGB. -LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Extract the alpha channel from ARGB. -LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_a, int dst_stride_a, - int width, int height); - -// Copy Y channel to Alpha of ARGB. -LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); - -// Get function to Alpha Blend ARGB pixels and store to destination. -LIBYUV_API -ARGBBlendRow GetARGBBlend(); - -// Alpha Blend ARGB images and store to destination. -// Source is pre-multiplied by alpha using ARGBAttenuate. -// Alpha of destination is set to 255. -LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Alpha Blend plane and store to destination. -// Source is not pre-multiplied by alpha. -LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height); - -// Alpha Blend YUV images and store to destination. -// Source is not pre-multiplied by alpha. -// Alpha is full width x height and subsampled to half size to apply to UV. -LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. -LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Add ARGB image with ARGB image. Saturates to 255. -LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. -LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert I422 to YUY2. -LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -// Convert I422 to UYVY. -LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -// Convert unattentuated ARGB to preattenuated ARGB. -LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Convert preattentuated ARGB to unattenuated ARGB. -LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Internal function - do not call directly. -// Computes table of cumulative sum for image where the value is the sum -// of all values above and to the left of the entry. Used by ARGBBlur. -LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height); - -// Blur ARGB image. -// dst_cumsum table of width * (height + 1) * 16 bytes aligned to -// 16 byte boundary. -// dst_stride32_cumsum is number of ints in a row (width * 4). -// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. -// Blur is optimized for radius of 5 (11x11) or less. -LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius); - -// Multiply ARGB image by ARGB value. -LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value); - -// Interpolate between two images using specified amount of interpolation -// (0 to 255) and store to destination. -// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 -// and 255 means 1% src0 and 99% src1. -LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation); - -// Interpolate between two ARGB images using specified amount of interpolation -// Internally calls InterpolatePlane with width * 4 (bpp). -LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation); - -// Interpolate between two YUV images using specified amount of interpolation -// Internally calls InterpolatePlane on each plane where the U and V planes -// are half width and half height. -LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation); - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +// TODO(fbarchard): Move cpu macros to row.h +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 @@ -479,43 +40,808 @@ int I420Interpolate(const uint8* src0_y, int src0_stride_y, #define HAS_ARGBAFFINEROW_SSE2 #endif +// Copy a plane of data. +LIBYUV_API +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height); + +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 1024 for 10 bits + int width, + int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value); + +// Split interleaved UV plane into separate U and V planes. +LIBYUV_API +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Merge separate U and V planes into one interleaved UV plane. +LIBYUV_API +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Split interleaved RGB plane into separate R, G and B planes. +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// Merge separate R, G and B planes into one interleaved RGB plane. +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +#define J400ToJ400 I400ToI400 + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define J420ToJ400 I420ToI400 +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Alias +#define RGB24ToRAW RAWToRGB24 + +LIBYUV_API +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const float* poly, + int width, + int height); + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height); + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Copy Alpha channel of ARGB to alpha of ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Extract the alpha channel from ARGB. +LIBYUV_API +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); + +// Copy Y channel to Alpha of ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Source is pre-multiplied by alpha using ARGBAttenuate. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Alpha Blend plane and store to destination. +// Source is not pre-multiplied by alpha. +LIBYUV_API +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +// Alpha Blend YUV images and store to destination. +// Source is not pre-multiplied by alpha. +// Alpha is full width x height and subsampled to half size to apply to UV. +LIBYUV_API +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + +// Convert I422 to UYVY. +LIBYUV_API +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); + +// Convert unattentuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert preattentuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value); + +// Interpolate between two images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 +// and 255 means 1% src0 and 99% src1. +LIBYUV_API +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation); + +// Interpolate between two ARGB images using specified amount of interpolation +// Internally calls InterpolatePlane with width * 4 (bpp). +LIBYUV_API +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation); + +// Interpolate between two YUV images using specified amount of interpolation +// Internally calls InterpolatePlane on each plane where the U and V planes +// are half width and half height. +LIBYUV_API +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation); + // Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); +// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes and must be aligned. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height); +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height); // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h index 8af60b8955..76b692be8b 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" @@ -20,8 +20,8 @@ extern "C" { // Supported rotation. typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. kRotate180 = 180, // Rotate 180 degrees. kRotate270 = 270, // Rotate 270 degrees clockwise. @@ -33,85 +33,132 @@ typedef enum RotationMode { // Rotate I420 frame. LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate NV12 input and store in I420. LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int src_width, int src_height, enum RotationMode mode); +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); // Rotate planes by 90, 180, 270. Deprecated. LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // Rotations for when U and V are interleaved. // These functions take one input pointer and // split the data into two buffers while // rotating them. Deprecated. LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // The 90 and 270 functions are based on transposes. // Doing a transpose with reversing the read/write // order will result in a rotation by +- 90 degrees. // Deprecated. LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h index 660ff5573e..20432949ab 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ #define INCLUDE_LIBYUV_ROTATE_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,13 +21,17 @@ extern "C" { // Rotate ARGB frame LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, enum RotationMode mode); +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + enum RotationMode mode); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h index ebc487f9ab..5edc0fcf13 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/rotate_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ #define INCLUDE_LIBYUV_ROTATE_ROW_H_ #include "libyuv/basic_types.h" @@ -18,10 +18,14 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) @@ -29,93 +33,162 @@ extern "C" { #endif #endif // The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 #endif -// The following are available for 64 bit GCC but not NaCL: -#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ - defined(__x86_64__) +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) #define HAS_TRANSPOSEWX8_FAST_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON #endif -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_TRANSPOSEWX8_DSPR2 -#define HAS_TRANSPOSEUVWX8_DSPR2 -#endif // defined(__mips__) +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_TRANSPOSEWX16_MSA +#define HAS_TRANSPOSEUVWX16_MSA +#endif -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height); +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); -void TransposeWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); -void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_Any_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h index 013a7e53e3..65ef448b8c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ #include // For malloc. @@ -20,41 +20,20 @@ namespace libyuv { extern "C" { #endif -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) - -#ifdef __cplusplus -#define align_buffer_64(var, size) \ - uint8* var##_mem = reinterpret_cast(malloc((size) + 63)); \ - uint8* var = reinterpret_cast \ - ((reinterpret_cast(var##_mem) + 63) & ~63) -#else -#define align_buffer_64(var, size) \ - uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ - uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ -#endif - -#define free_aligned_buffer_64(var) \ - free(var##_mem); \ - var = 0 - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif -// True if compiling for SSSE3 as a requirement. -#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) -#define LIBYUV_SSSE3_ONLY -#endif - -#if defined(__native_client__) -#define LIBYUV_DISABLE_NEON -#endif // clang >= 3.5.0 required for Arm64. #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) @@ -76,9 +55,19 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ +// clang >= 6.0.0 required for AVX512. +// TODO(fbarchard): fix xcode 9 ios b/789. +#if 0 // Build fails in libvpx on Mac +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__) +#define CLANG_HAS_AVX512 1 +#endif // clang >= 7 +#endif // __clang__ +#endif // 0 + // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -90,8 +79,8 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 @@ -104,12 +93,12 @@ extern "C" { #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 +#define HAS_HALFFLOATROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -126,8 +115,10 @@ extern "C" { #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 @@ -180,11 +171,8 @@ extern "C" { // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. -#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \ - !defined(__i386__) || defined(_MSC_VER) -// TODO(fbarchard): fix build error on x86 debug -// https://code.google.com/p/libyuv/issues/detail?id=524 -#define HAS_I411TOARGBROW_SSSE3 +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 @@ -193,11 +181,12 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBEXTRACTALPHAROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 @@ -208,13 +197,9 @@ extern "C" { #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 +#define HAS_HALFFLOATROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I400TOARGBROW_AVX2 -#if !(defined(_DEBUG) && defined(__i386__)) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#endif -#define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -227,8 +212,10 @@ extern "C" { #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB24ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -246,11 +233,18 @@ extern "C" { #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif #endif // The following are available for AVX2 Visual C and clangcl 32 bit: // TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 @@ -268,6 +262,51 @@ extern "C" { #define HAS_I422TOARGBROW_SSSE3 #endif +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_CONVERT16TO8ROW_SSSE3 +#define HAS_CONVERT8TO16ROW_SSE2 +// I210 is for H010. 2 = 422. I for 601 vs H for 709. +#define HAS_I210TOAR30ROW_SSSE3 +#define HAS_I210TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 +#define HAS_MERGERGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 +#endif + +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTORAWROW_AVX2 +#define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_CONVERT16TO8ROW_AVX2 +#define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_I210TOAR30ROW_AVX2 +#define HAS_I210TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#endif + +// The following are available for AVX512 clang x86 platforms: +// TODO(fbarchard): Port to GCC and Visual C +// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX512)) +#define HAS_ARGBTORGB24ROW_AVX512VBMI +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -279,6 +318,7 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON @@ -286,18 +326,17 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON +#define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON +#define HAS_HALFFLOATROW_NEON #define HAS_I400TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON @@ -313,8 +352,10 @@ extern "C" { #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB24ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTOUVROW_NEON @@ -328,6 +369,7 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON +#define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -359,17 +401,87 @@ extern "C" { #define HAS_SOBELYROW_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) -#define HAS_COPYROW_MIPS -#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_I422TOARGBROW_DSPR2 -#define HAS_INTERPOLATEROW_DSPR2 -#define HAS_MIRRORROW_DSPR2 -#define HAS_MIRRORUVROW_DSPR2 -#define HAS_SPLITUVROW_DSPR2 +// The following are available on AArch64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALESUMSAMPLES_NEON #endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_ARGB1555TOYROW_MSA +#define HAS_ARGB4444TOARGBROW_MSA +#define HAS_ARGBADDROW_MSA +#define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBBLENDROW_MSA +#define HAS_ARGBCOLORMATRIXROW_MSA +#define HAS_ARGBEXTRACTALPHAROW_MSA +#define HAS_ARGBGRAYROW_MSA +#define HAS_ARGBMIRRORROW_MSA +#define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBQUANTIZEROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGBSETROW_MSA +#define HAS_ARGBSHADEROW_MSA +#define HAS_ARGBSHUFFLEROW_MSA +#define HAS_ARGBSUBTRACTROW_MSA +#define HAS_ARGBTOARGB1555ROW_MSA +#define HAS_ARGBTOARGB4444ROW_MSA +#define HAS_ARGBTORAWROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBTORGB565ROW_MSA +#define HAS_ARGBTOUV444ROW_MSA +#define HAS_ARGBTOUVJROW_MSA +#define HAS_ARGBTOUVROW_MSA +#define HAS_ARGBTOYJROW_MSA +#define HAS_ARGBTOYROW_MSA +#define HAS_BGRATOUVROW_MSA +#define HAS_BGRATOYROW_MSA +#define HAS_HALFFLOATROW_MSA +#define HAS_I400TOARGBROW_MSA +#define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGBROW_MSA +#define HAS_I422TORGB24ROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_I444TOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA +#define HAS_J400TOARGBROW_MSA +#define HAS_MERGEUVROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA +#define HAS_RAWTOARGBROW_MSA +#define HAS_RAWTORGB24ROW_MSA +#define HAS_RAWTOUVROW_MSA +#define HAS_RAWTOYROW_MSA +#define HAS_RGB24TOARGBROW_MSA +#define HAS_RGB24TOUVROW_MSA +#define HAS_RGB24TOYROW_MSA +#define HAS_RGB565TOARGBROW_MSA +#define HAS_RGB565TOUVROW_MSA +#define HAS_RGB565TOYROW_MSA +#define HAS_RGBATOUVROW_MSA +#define HAS_RGBATOYROW_MSA +#define HAS_SETROW_MSA +#define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXROW_MSA +#define HAS_SOBELXYROW_MSA +#define HAS_SOBELYROW_MSA +#define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -378,18 +490,18 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif -typedef __declspec(align(16)) int16 vec16[8]; -typedef __declspec(align(16)) int32 vec32[4]; -typedef __declspec(align(16)) int8 vec8[16]; -typedef __declspec(align(16)) uint16 uvec16[8]; -typedef __declspec(align(16)) uint32 uvec32[4]; -typedef __declspec(align(16)) uint8 uvec8[16]; -typedef __declspec(align(32)) int16 lvec16[16]; -typedef __declspec(align(32)) int32 lvec32[8]; -typedef __declspec(align(32)) int8 lvec8[32]; -typedef __declspec(align(32)) uint16 ulvec16[16]; -typedef __declspec(align(32)) uint32 ulvec32[8]; -typedef __declspec(align(32)) uint8 ulvec8[32]; +typedef __declspec(align(16)) int16_t vec16[8]; +typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) int8_t vec8[16]; +typedef __declspec(align(16)) uint16_t uvec16[8]; +typedef __declspec(align(16)) uint32_t uvec32[4]; +typedef __declspec(align(16)) uint8_t uvec8[16]; +typedef __declspec(align(32)) int16_t lvec16[16]; +typedef __declspec(align(32)) int32_t lvec32[8]; +typedef __declspec(align(32)) int8_t lvec8[32]; +typedef __declspec(align(32)) uint16_t ulvec16[16]; +typedef __declspec(align(32)) uint32_t ulvec32[8]; +typedef __declspec(align(32)) uint8_t ulvec8[32]; #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) @@ -397,32 +509,32 @@ typedef __declspec(align(32)) uint8 ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif -typedef int16 __attribute__((vector_size(16))) vec16; -typedef int32 __attribute__((vector_size(16))) vec32; -typedef int8 __attribute__((vector_size(16))) vec8; -typedef uint16 __attribute__((vector_size(16))) uvec16; -typedef uint32 __attribute__((vector_size(16))) uvec32; -typedef uint8 __attribute__((vector_size(16))) uvec8; -typedef int16 __attribute__((vector_size(32))) lvec16; -typedef int32 __attribute__((vector_size(32))) lvec32; -typedef int8 __attribute__((vector_size(32))) lvec8; -typedef uint16 __attribute__((vector_size(32))) ulvec16; -typedef uint32 __attribute__((vector_size(32))) ulvec32; -typedef uint8 __attribute__((vector_size(32))) ulvec8; +typedef int16_t __attribute__((vector_size(16))) vec16; +typedef int32_t __attribute__((vector_size(16))) vec32; +typedef int8_t __attribute__((vector_size(16))) vec8; +typedef uint16_t __attribute__((vector_size(16))) uvec16; +typedef uint32_t __attribute__((vector_size(16))) uvec32; +typedef uint8_t __attribute__((vector_size(16))) uvec8; +typedef int16_t __attribute__((vector_size(32))) lvec16; +typedef int32_t __attribute__((vector_size(32))) lvec32; +typedef int8_t __attribute__((vector_size(32))) lvec8; +typedef uint16_t __attribute__((vector_size(32))) ulvec16; +typedef uint32_t __attribute__((vector_size(32))) ulvec32; +typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var -typedef int16 vec16[8]; -typedef int32 vec32[4]; -typedef int8 vec8[16]; -typedef uint16 uvec16[8]; -typedef uint32 uvec32[4]; -typedef uint8 uvec8[16]; -typedef int16 lvec16[16]; -typedef int32 lvec32[8]; -typedef int8 lvec8[32]; -typedef uint16 ulvec16[16]; -typedef uint32 ulvec32[8]; -typedef uint8 ulvec8[32]; +typedef int16_t vec16[8]; +typedef int32_t vec32[4]; +typedef int8_t vec8[16]; +typedef uint16_t uvec16[8]; +typedef uint32_t uvec32[4]; +typedef uint8_t uvec8[16]; +typedef int16_t lvec16[16]; +typedef int32_t lvec32[8]; +typedef int8_t lvec8[32]; +typedef uint16_t ulvec16[16]; +typedef uint32_t ulvec32[8]; +typedef uint8_t ulvec8[32]; #endif #if defined(__aarch64__) @@ -446,23 +558,23 @@ struct YuvConstants { #else // This struct is for Intel color conversion. struct YuvConstants { - int8 kUVToB[32]; - int8 kUVToG[32]; - int8 kUVToR[32]; - int16 kUVBiasB[16]; - int16 kUVBiasG[16]; - int16 kUVBiasR[16]; - int16 kYToRgb[16]; + int8_t kUVToB[32]; + int8_t kUVToG[32]; + int8_t kUVToR[32]; + int16_t kUVBiasB[16]; + int16_t kUVBiasG[16]; + int16_t kUVBiasR[16]; + int16_t kYToRgb[16]; }; // Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 #define KUVBIASB 96 #define KUVBIASG 128 #define KUVBIASR 160 -#define KYTORGB 192 +#define KYTORGB 192 #endif // Conversion matrix for YUV to RGB @@ -475,6 +587,16 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) + +#define align_buffer_64(var, size) \ + uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP #else @@ -487,1458 +609,2863 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 #else #define LABELALIGN #endif -#if defined(__native_client__) && defined(__x86_64__) -// r14 is used for MEMOP macros. -#define NACL_R14 "r14", -#define BUNDLELOCK ".bundle_lock\n" -#define BUNDLEUNLOCK ".bundle_unlock\n" -#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" -#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" -#define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" -#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" -#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" \ - BUNDLEUNLOCK -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%" #arg "\n" \ - BUNDLEUNLOCK -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ - BUNDLEUNLOCK -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#else // defined(__native_client__) && defined(__x86_64__) -#define NACL_R14 -#define BUNDLEALIGN -#define MEMACCESS(base) "(%" #base ")" -#define MEMACCESS2(offset, base) #offset "(%" #base ")" -#define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," #scale ")" -#define MEMMOVESTRING(s, d) -#define MEMSTORESTRING(reg, d) -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ - #reg2 "\n" -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#endif // defined(__native_client__) && defined(__x86_64__) -#if defined(__arm__) || defined(__aarch64__) -#undef MEMACCESS -#if defined(__native_client__) -#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" -#else -#define MEMACCESS(base) -#endif +// Intel Code Analizer markers. Insert IACA_START IACA_END around code to be +// measured and then run with iaca -64 libyuv_unittest. +// IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within +// inline assembly blocks. +// example of iaca: +// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest + +#if defined(__x86_64__) || defined(__i386__) + +#define IACA_ASM_START \ + ".byte 0x0F, 0x0B\n" \ + " movl $111, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" + +#define IACA_ASM_END \ + " movl $222, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" \ + ".byte 0x0F, 0x0B\n" + +#define IACA_SSC_MARK(MARK_ID) \ + __asm__ __volatile__("\n\t movl $" #MARK_ID \ + ", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : \ + : \ + : "memory"); + +#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); + +#else /* Visual C */ +#define IACA_UD_BYTES \ + { __asm _emit 0x0F __asm _emit 0x0B } + +#define IACA_SSC_MARK(x) \ + { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); #endif -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +#define IACA_START \ + { \ + IACA_UD_BYTES \ + IACA_SSC_MARK(111) \ + } +#define IACA_END \ + { \ + IACA_SSC_MARK(222) \ + IACA_UD_BYTES \ + } + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); - -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, int width); -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, - int width); -void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, - int width); - -void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, - int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, - int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width); -void MirrorRow_C(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width); -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void SplitUVRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void SplitUVRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void CopyRow_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_ERMS(const uint8* src, uint8* dst, int count); -void CopyRow_NEON(const uint8* src, uint8* dst, int count); -void CopyRow_MIPS(const uint8* src, uint8* dst, int count); -void CopyRow_C(const uint8* src, uint8* dst, int count); -void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); -void CopyRow_16_C(const uint16* src, uint16* dst, int count); +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); -void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void MergeRGBRow_Any_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); + +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, /* 64 for 10 bit */ + int width); +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width); + +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); + +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); +void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); + +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); + +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a, +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width); +void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a, +void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, - int width); -void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, - int width); +void SetRow_C(uint8_t* dst, uint8_t v8, int width); +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); +void SetRow_X86(uint8_t* dst, uint8_t v8, int width); +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); +void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); +void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); -void SetRow_C(uint8* dst, uint8 v8, int count); -void SetRow_X86(uint8* dst, uint8 v8, int count); -void SetRow_ERMS(uint8* dst, uint8 v8, int count); -void SetRow_NEON(uint8* dst, uint8 v8, int count); -void SetRow_Any_X86(uint8* dst, uint8 v8, int count); -void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); - -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); -void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); +void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); +void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); // ARGBShufflers for BGRAToARGB etc. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); - -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width); - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, int width); -void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); - -void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width); - -void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, +void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, int width); -void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, +void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); + +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); + +void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, +void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width); +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width); +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width); +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + +void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); + +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + +void I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I411ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width); +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width); +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width); +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); // Unattenuated planar alpha blend. -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width); +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); +void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); +void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width); // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB add images. -void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); -void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); +void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); -void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, - int width); -void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, - int width); -void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); - -void I444ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_Any_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, +void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); + +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void I422ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); - -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_Any_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_Any_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); // Effects related row functions. -void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); // Inverse table for unattenuate, shared by C and SSE2. -extern const uint32 fixed_invtbl8[256]; -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +extern const uint32_t fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); -void ARGBSepiaRow_C(uint8* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); -void ARGBSepiaRow_NEON(uint8* dst_argb, int width); +void ARGBSepiaRow_C(uint8_t* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); -void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); -void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. -void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); -void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); // Sobel images. -void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, int width); -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width); -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width); -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +// Scale and convert to half float. +void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width); +void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, + float* dst_ptr, + float param, + int width); + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); + const uint8_t* luma, + uint32_t lumacoeff); + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width); +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width); +void ScaleSamples_C(const float* src, float* dst, float scale, int width); +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h index 102158d1ab..b937d348ca 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_H_ #define INCLUDE_LIBYUV_SCALE_H_ #include "libyuv/basic_types.h" @@ -20,25 +20,33 @@ extern "C" { // Supported filtering. typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. + kFilterBox = 3 // Highest quality. } FilterModeEnum; // Scale a YUV plane. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); // Scales a YUV 4:2:0 image from the src width and height to the @@ -52,44 +60,64 @@ void ScalePlane_16(const uint16* src, int src_stride, // Returns 0 if successful. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); #ifdef __cplusplus // Legacy API. Deprecated. LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate); -// Legacy API. Deprecated. -LIBYUV_API -int ScaleOffset(const uint8* src_i420, int src_width, int src_height, - uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate); - // For testing, allow disabling of specialized scalers. LIBYUV_API void SetUseReferenceImpl(LIBYUV_BOOL use); @@ -100,4 +128,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h index b56cf52099..7641f18e34 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ #define INCLUDE_LIBYUV_SCALE_ARGB_H_ #include "libyuv/basic_types.h" @@ -20,32 +20,52 @@ extern "C" { #endif LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering); // Clipped scale takes destination rectangle coordinates for clip values. LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); #ifdef __cplusplus @@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h index df699e6c22..7194ba09f8 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/scale_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" @@ -19,17 +19,20 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif - // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -45,8 +48,8 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -72,15 +75,16 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_SCALEADDROW_AVX2 #define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN4_AVX2 #endif // The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBCOLS_NEON #define HAS_SCALEARGBROWDOWN2_NEON @@ -93,33 +97,51 @@ extern "C" { #define HAS_SCALEARGBFILTERCOLS_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_DSPR2 -#define HAS_SCALEROWDOWN4_DSPR2 -#define HAS_SCALEROWDOWN34_DSPR2 -#define HAS_SCALEROWDOWN38_DSPR2 +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_SCALEADDROW_MSA +#define HAS_SCALEARGBCOLS_MSA +#define HAS_SCALEARGBFILTERCOLS_MSA +#define HAS_SCALEARGBROWDOWN2_MSA +#define HAS_SCALEARGBROWDOWNEVEN_MSA +#define HAS_SCALEFILTERCOLS_MSA +#define HAS_SCALEROWDOWN2_MSA +#define HAS_SCALEROWDOWN34_MSA +#define HAS_SCALEROWDOWN38_MSA +#define HAS_SCALEROWDOWN4_MSA #endif // Scale ARGB vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering); // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering); // Divide num by div and return as 16.16 fixed point result. @@ -137,367 +159,786 @@ int FixedDiv1_X86(int num, int div); #endif // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy); + int* x, + int* y, + int* dx, + int* dy); -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int, int); -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int, int); -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, - ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); -void ScaleARGBRowDown2_C(const uint8* src_argb, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int, + int); +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int, + int); +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int, int); -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); + uint8_t* dst_argb, + int dst_width); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int, + int); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); // Specialized scalers for x86. -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); - -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. // Note - not static due to reuse in convert for 444 to 420. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); -void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); -void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); -void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); +void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); + +void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h index 0fbdc022d5..7022785d8c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/version.h @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1616 +#define LIBYUV_VERSION 1711 -#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h index ad934e4241..bcef378b5a 100644 --- a/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h +++ b/media/libvpx/libvpx/third_party/libyuv/include/libyuv/video_common.h @@ -10,7 +10,7 @@ // Common definitions for video, including fourcc and VideoFormat. -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ #include "libyuv/basic_types.h" @@ -28,13 +28,13 @@ extern "C" { // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus -#define FOURCC(a, b, c, d) ( \ - (static_cast(a)) | (static_cast(b) << 8) | \ - (static_cast(c) << 16) | (static_cast(d) << 24)) +#define FOURCC(a, b, c, d) \ + ((static_cast(a)) | (static_cast(b) << 8) | \ + (static_cast(c) << 16) | (static_cast(d) << 24)) #else -#define FOURCC(a, b, c, d) ( \ - ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ - ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ #endif // Some pages discussing FourCC codes: @@ -53,38 +53,33 @@ enum FourCC { FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I411 = FOURCC('I', '4', '1', '1'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb - // 2 Secondary YUV formats: row biplanar. + // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. - // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. + // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated. - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - // 1 Primary Compressed YUV format. FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + // 7 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), @@ -112,7 +107,13 @@ enum FourCC { FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - // 1 Auxiliary compressed YUV format set aside for capturer. + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), FOURCC_H264 = FOURCC('H', '2', '6', '4'), // Match any fourcc. @@ -136,8 +137,10 @@ enum FourCCBpp { FOURCC_BPP_BGRA = 32, FOURCC_BPP_ABGR = 32, FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, + FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, FOURCC_BPP_RGBO = 16, FOURCC_BPP_R444 = 16, @@ -152,6 +155,7 @@ enum FourCCBpp { FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, + FOURCC_BPP_H010 = 24, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, @@ -170,15 +174,15 @@ enum FourCCBpp { FOURCC_BPP_CM24 = 24, // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. + FOURCC_BPP_ANY = 0, // 0 means unknown. }; // Converts fourcc aliases into canonical ones. -LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); +LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc index e3846bdfdd..50e3abd055 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare.cc @@ -29,10 +29,10 @@ extern "C" { // hash seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { @@ -45,25 +45,25 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { } #endif - while (count >= (uint64)(kBlockSize)) { + while (count >= (uint64_t)(kBlockSize)) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } - remainder = (int)(count) & ~15; + remainder = (int)count & ~15; if (remainder) { seed = HashDjb2_SSE(src, remainder, seed); src += remainder; count -= remainder; } - remainder = (int)(count) & 15; + remainder = (int)count & 15; if (remainder) { seed = HashDjb2_C(src, remainder, seed); } return seed; } -static uint32 ARGBDetectRow_C(const uint8* argb, int width) { +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. @@ -94,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) { // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { - uint32 fourcc = 0; +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; int h; // Coalesce rows. @@ -111,19 +114,80 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. + +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + // TODO(fbarchard): Refactor into row function. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, - int count) { +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32. - // After each block of 65536 pixels, accumulate into a uint64. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. const int kBlockSize = 65536; int remainder = count & (kBlockSize - 1) & ~31; - uint64 sse = 0; + uint64_t sse = 0; int i; - uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = - SumSquareError_C; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -141,8 +205,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, SumSquareError = SumSquareError_AVX2; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif #ifdef _OPENMP -#pragma omp parallel for reduction(+: sse) +#pragma omp parallel for reduction(+ : sse) #endif for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); @@ -162,14 +231,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - uint64 sse = 0; +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; int h; // Coalesce rows. - if (stride_a == width && - stride_b == width) { + if (stride_a == width && stride_b == width) { width *= height; height = 1; stride_a = stride_b = 0; @@ -183,66 +254,76 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, } LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count) { +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { double psnr; if (sse > 0) { - double mse = (double)(count) / (double)(sse); + double mse = (double)count / (double)sse; psnr = 10.0 * log10(255.0 * 255.0 * mse); } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 + psnr = kMaxPsnr; // Limit to prevent divide by 0 } - if (psnr > kMaxPsnr) + if (psnr > kMaxPsnr) { psnr = kMaxPsnr; + } return psnr; } LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - const uint64 samples = width * height; - const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, - src_b, stride_b, - width, height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, - src_y_b, stride_y_b, - width, height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, - src_u_b, stride_u_b, - width_uv, height_uv); - const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, - src_v_b, stride_v_b, - width_uv, height_uv); - const uint64 samples = width * height + 2 * (width_uv * height_uv); - const uint64 sse = sse_y + sse_u + sse_v; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } -static const int64 cc1 = 26634; // (64^2*(.01*255)^2 -static const int64 cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double Ssim8x8_C(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) { - int64 sum_a = 0; - int64 sum_b = 0; - int64 sum_sq_a = 0; - int64 sum_sq_b = 0; - int64 sum_axb = 0; +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; int i; for (i = 0; i < 8; ++i) { @@ -260,22 +341,22 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, } { - const int64 count = 64; + const int64_t count = 64; // scale the constants by number of pixels - const int64 c1 = (cc1 * count * count) >> 12; - const int64 c2 = (cc2 * count * count) >> 12; + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; - const int64 sum_a_x_sum_b = sum_a * sum_b; + const int64_t sum_a_x_sum_b = sum_a * sum_b; - const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - const int64 sum_a_sq = sum_a*sum_a; - const int64 sum_b_sq = sum_b*sum_b; + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; - const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + - count * sum_sq_b - sum_b_sq + c2); + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); if (ssim_d == 0.0) { return DBL_MAX; @@ -288,13 +369,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { int samples = 0; double ssim_total = 0; - double (*Ssim8x8)(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) = Ssim8x8_C; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location int i; @@ -314,22 +398,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a, } LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, - src_y_b, stride_y_b, width, height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, - src_u_b, stride_u_b, + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, - src_v_b, stride_v_b, + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc index 42fc589354..d4b170ad98 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_common.cc @@ -17,20 +17,80 @@ namespace libyuv { extern "C" { #endif -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse = 0u; +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. +uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { int diff = src_a[i] - src_b[i]; - sse += (uint32)(diff * diff); + sse += (uint32_t)(diff * diff); } return sse; } // hash seed of 5381 recommended. // Internal C version of HashDjb2 with int sized count for efficiency. -uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { - uint32 hash = seed; +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; int i; for (i = 0; i < count; ++i) { hash += (hash << 5) + src[i]; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc index 1b83edb166..676527c1b1 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_gcc.cc @@ -22,124 +22,334 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse; - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10, 1) ",%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // Process 32 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } -static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 -static uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; -static uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; -static uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; -static uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { - uint32 hash; - asm volatile ( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) @@ -148,4 +358,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { } // extern "C" } // namespace libyuv #endif - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc new file mode 100644 index 0000000000..0b807d37be --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc index 49aa3b4eef..2a2181e0cb 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon.cc @@ -21,40 +21,70 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); return sse; } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc index f9c7df98c8..6e8f672ab7 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_neon64.cc @@ -20,39 +20,65 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc index dc86fe25b1..d57d3d9d1c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/compare_win.cc @@ -13,20 +13,39 @@ #include "libyuv/compare_row.h" #include "libyuv/row.h" +#if defined(_MSC_VER) +#include // For __popcnt +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} + +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 @@ -61,13 +80,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. -#pragma warning(disable: 4752) -__declspec(naked) -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax @@ -101,65 +120,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { } #endif // _MSC_VER >= 1700 -uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -__declspec(naked) -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck + pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, xmmword ptr kHash16x33 wloop: - movdqu xmm1, [eax] // src[0-15] + movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 + pmulld xmm0, xmm6 // hash *= 33 ^ 16 movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] + punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] + punpcklwd xmm3, xmm7 // src[0-3] pmulld xmm3, xmm5 movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] + punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] + punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] + punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] + punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results + paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 @@ -171,18 +190,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + movd eax, xmm0 // return hash ret } } // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 -__declspec(naked) -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count vmovd xmm0, [esp + 12] // seed wloop: @@ -196,7 +215,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm3, xmm3, xmm4 // add 16 results vpaddd xmm1, xmm1, xmm2 vpaddd xmm1, xmm1, xmm3 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -207,7 +226,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - vmovd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash vzeroupper ret } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc index a33742d24d..375cc732c1 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert.cc @@ -14,8 +14,8 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" -#include "libyuv/scale.h" // For ScalePlane() #include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() #ifdef __cplusplus namespace libyuv { @@ -28,14 +28,22 @@ static __inline int Abs(int v) { } // Any I4xx To I420 format with mirroring. -static int I4xxToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int src_uv_width, int src_uv_height) { +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); @@ -44,35 +52,37 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y, return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } -// Copy I420 with optional flipping +// Copy I420 with optional flipping. // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -96,79 +106,152 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } +// Copy I010 with optional flipping. +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); } // 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - width, height); -} - -// 411 chroma is 1/4 width, 1x height -// 420 chroma is 1/2 width, 1/2 height -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int src_uv_width = SUBSAMPLE(width, 3, 2); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); } // I400 is greyscale typically used in MJPG LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!dst_u || !dst_v || - width <= 0 || height == 0) { + if (!dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -186,11 +269,15 @@ int I400ToI420(const uint8* src_y, int src_stride_y, return 0; } -static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride, - int width, int height) { +static void CopyPlane2(const uint8_t* src, + int src_stride_0, + int src_stride_1, + uint8_t* dst, + int dst_stride, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -211,11 +298,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height - 1; y += 2) { @@ -238,17 +320,22 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // src_stride_m420 is row planar. Normally this will be the width in pixels. // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. -static int X420ToI420(const uint8* src_y, - int src_stride_y0, int src_stride_y1, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +static int X420ToI420(const uint8_t* src_y, + int src_stride_y0, + int src_stride_y1, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_uv || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -265,16 +352,14 @@ static int X420ToI420(const uint8* src_y, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_y0 == width && - src_stride_y1 == width && + if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } // Coalesce rows. - if (src_stride_uv == halfwidth * 2 && - dst_stride_u == halfwidth && + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && dst_stride_v == halfwidth) { halfwidth *= halfheight; halfheight = 1; @@ -299,63 +384,78 @@ static int X420ToI420(const uint8* src_y, // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); } // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_vu, src_stride_vu, - dst_y, dst_stride_y, - dst_v, dst_stride_v, - dst_u, dst_stride_u, - width, height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); } // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); } // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int width) = YUY2ToYRow_C; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -392,6 +492,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -411,16 +521,22 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I420. LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -457,6 +573,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -476,19 +602,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, // Convert ARGB to I420. LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -533,6 +663,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -552,19 +698,23 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, // Convert BGRA to I420. LIBYUV_API -int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; - if (!src_bgra || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -592,12 +742,28 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } #endif for (y = 0; y < height - 1; y += 2) { @@ -618,19 +784,23 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, // Convert ABGR to I420. LIBYUV_API -int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; - if (!src_abgr || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -665,6 +835,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -684,19 +870,23 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, // Convert RGBA to I420. LIBYUV_API -int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; - if (!src_rgba || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -731,6 +921,22 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } } #endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -750,27 +956,33 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, // Convert RGB24 to I420. LIBYUV_API -int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB24TOYROW_NEON) - void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -792,6 +1004,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -822,14 +1043,17 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#endif + { +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -846,7 +1070,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -855,36 +1079,41 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB24TOYROW_NEON) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RAW to I420. LIBYUV_API -int RAWToI420(const uint8* src_raw, int src_stride_raw, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RAWTOYROW_NEON) - void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYRow_C; #else - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_raw || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -906,6 +1135,15 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -936,14 +1174,17 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } #endif +#endif + { +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -960,7 +1201,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -969,36 +1210,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RAWTOYROW_NEON) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RGB565 to I420. LIBYUV_API -int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB565TOYROW_NEON) - void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1020,6 +1267,15 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1057,15 +1313,16 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -1082,7 +1339,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -1091,36 +1348,43 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB565TOYROW_NEON) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB1555 to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_ARGB1555TOYROW_NEON) - void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = - ARGB1555ToYRow_C; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1142,6 +1406,15 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. #else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -1179,15 +1452,17 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -1206,7 +1481,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -1215,36 +1490,43 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_ARGB1555TOYROW_NEON) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB4444 to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; #if defined(HAS_ARGB4444TOYROW_NEON) - void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = - ARGB4444ToYRow_C; + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1284,6 +1566,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1304,7 +1594,22 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#endif + { +#if !defined(HAS_ARGB4444TOYROW_NEON) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); @@ -1341,13 +1646,15 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #if !defined(HAS_ARGB4444TOYROW_NEON) free_aligned_buffer_64(row); - } #endif + } return 0; } -static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, - uint8* dst_u, int width) { +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { int i; for (i = 0; i < width; ++i) { *dst_u = *src_u; @@ -1358,21 +1665,26 @@ static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, int src_pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - const int vu_off = src_v - src_u; + const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1396,15 +1708,16 @@ int Android420ToI420(const uint8* src_y, int src_stride_y, CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; - // Split UV planes - NV21 - } else if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, halfwidth, halfheight); return 0; - // Split UV planes - NV12 - } else if (src_pixel_stride_uv == 2 && vu_off == 1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc index fb9582d627..f2fe474f70 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_argb.cc @@ -26,11 +26,13 @@ extern "C" { // Copy ARGB with optional flipping LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0) { +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width * 4, height); + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); return 0; } -// Convert I422 to ARGB with matrix -static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -93,13 +97,12 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -117,111 +120,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I420 to ABGR. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -231,10 +253,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; @@ -263,13 +283,12 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -285,111 +304,380 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. +static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToAR30Row = I210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToAR30Row = I210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToAR30Row = I210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); +} + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -399,9 +687,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u == width && - src_stride_v == width && + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -431,6 +717,14 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -444,138 +738,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); -} - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - int y; - void (*I411ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I411ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 4 == width && - src_stride_v * 4 == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I411TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I411ToARGBRow = I411ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I411TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I411ToARGBRow = I411ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I411ToARGBRow = I411ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I411TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I411ToARGBRow = I411ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert I420 with Alpha to preattenuated ARGB. -static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, +static int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height, int attenuate) { + int width, + int height, + int attenuate) { int y; - void (*I422AlphaToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -608,13 +845,12 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422ALPHATOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) @@ -641,6 +877,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -661,49 +905,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 with Alpha to ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_a, src_stride_a, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height, attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); } // Convert I420 with Alpha to ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - src_a, src_stride_a, - dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); } // Convert I400 to ARGB. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8* y_buf, - uint8* rgb_buf, - int width) = I400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -713,8 +967,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -743,6 +996,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -754,14 +1015,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, // Convert J400 to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = J400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -771,8 +1034,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -800,6 +1062,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, J400ToARGBRow = J400ToARGBRow_NEON; } } +#endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); @@ -810,85 +1080,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. -static uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. -static uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Convert BGRA to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API -int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int ARGBToBGRA(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ABGR to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API -int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ARGBToABGR(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert RGBA to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskRGBAToARGB), - width, height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); } // Convert RGB24 to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -898,8 +1172,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. - if (src_stride_rgb24 == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb24 = dst_stride_argb = 0; @@ -920,6 +1193,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -931,14 +1212,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, // Convert RAW to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - if (!src_raw || !dst_argb || - width <= 0 || height == 0) { + if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -948,8 +1231,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. - if (src_stride_raw == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_argb = 0; @@ -970,6 +1252,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -981,14 +1271,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, // Convert RGB565 to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || - width <= 0 || height == 0) { + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -998,8 +1290,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = -src_stride_rgb565; } // Coalesce rows. - if (src_stride_rgb565 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb565 = dst_stride_argb = 0; @@ -1028,6 +1319,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1039,14 +1338,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, // Convert ARGB1555 to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1056,8 +1357,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = -src_stride_argb1555; } // Coalesce rows. - if (src_stride_argb1555 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb1555 = dst_stride_argb = 0; @@ -1086,6 +1386,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1097,14 +1405,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, // Convert ARGB4444 to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1114,8 +1424,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = -src_stride_argb4444; } // Coalesce rows. - if (src_stride_argb4444 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb4444 = dst_stride_argb = 0; @@ -1144,6 +1453,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -1153,20 +1470,117 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, return 0; } -// Convert NV12 to ARGB. +// Convert AR30 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert NV12 to ARGB with matrix +static int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1199,9 +1613,17 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1211,20 +1633,21 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, return 0; } -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert NV21 to ARGB with matrix +static int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV21ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1257,11 +1680,136 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV12 to ABGR. +// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix. +// To swap the UV use NV12 instead of NV21.LIBYUV_API +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix +static int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } @@ -1269,19 +1817,109 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert NV21 to RGB24 with matrix +static int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix. +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || - width <= 0 || height == 0) { + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1314,6 +1952,14 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, @@ -1332,17 +1978,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1352,8 +1998,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_yuy2 = dst_stride_argb = 0; @@ -1381,6 +2026,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToARGBRow = YUY2ToARGBRow_NEON; } } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); @@ -1392,17 +2045,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to ARGB. LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1412,8 +2065,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_uyvy = dst_stride_argb = 0; @@ -1441,6 +2093,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, UYVYToARGBRow = UYVYToARGBRow_NEON; } } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); @@ -1449,6 +2109,121 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } return 0; } +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} #ifdef __cplusplus } // extern "C" diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc index 3b2dca8163..6fa253237e 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from.cc @@ -15,9 +15,9 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" -#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -30,109 +30,144 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. -static int I420ToI4xx(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int dst_uv_width, int dst_uv_height) { +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || - dst_uv_width <= 0 || dst_uv_height <= 0) { + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. +LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); return 0; } // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = (Abs(width) + 1) >> 1; const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = Abs(width); const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 411 chroma is 1/4 width, 1x height -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int dst_uv_width = (Abs(width) + 3) >> 2; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -146,17 +181,21 @@ int I400Copy(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -166,10 +205,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_yuy2 == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; @@ -182,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -202,17 +247,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -229,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -237,6 +294,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -254,17 +319,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -274,10 +343,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uyvy == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; @@ -290,6 +357,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -298,6 +373,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -310,17 +393,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -337,6 +424,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -345,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -363,14 +466,20 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, // TODO(fbarchard): test negative height for invert. LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || - width <= 0 || height == 0) { +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { return -1; } int halfwidth = (width + 1) / 2; @@ -378,44 +487,47 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } - MergeUVPlane(src_u, src_stride_u, - src_v, src_stride_v, - dst_uv, dst_stride_uv, + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; } LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height) { - return I420ToNV12(src_y, src_stride_y, - src_v, src_stride_v, - src_u, src_stride_u, - dst_y, dst_stride_y, - dst_vu, dst_stride_vu, +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -448,13 +560,12 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -472,50 +583,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGBA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I420 to BGRA. LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, +static int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || - width <= 0 || height == 0) { + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -548,6 +667,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -563,50 +690,95 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGB24. LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, - width, height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); } // Convert I420 to RAW. LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_raw, dst_stride_raw, +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, &kYvuI601Constants, // Use Yvu matrix width, height); } +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -639,6 +811,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, @@ -653,23 +833,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, return 0; } - // Convert I420 to ARGB4444. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*I422ToARGB4444Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -702,6 +884,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, @@ -718,20 +908,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, // Convert I420 to RGB565. LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -764,6 +956,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); @@ -777,32 +977,102 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, return 0; } +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -838,12 +1108,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) @@ -869,6 +1139,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; } } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } #endif { // Allocate a row of argb. @@ -876,7 +1154,8 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -889,220 +1168,254 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, return 0; } +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. +LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + // Convert I420 to specified format LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int r = 0; - if (!y || !u|| !v || !dst_sample || - width <= 0 || height == 0) { + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; } switch (format) { // Single plane formats case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_RAW: - r = I420ToRAW(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_I400: - r = I400Copy(y, y_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; case FOURCC_NV12: { - uint8* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_uv, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } case FOURCC_NV21: { - uint8* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_vu, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } // TODO(fbarchard): Add M420. // Triplanar formats - // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: case FOURCC_YV12: { - int halfwidth = (width + 1) / 2; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV12) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * halfheight; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * halfheight; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; } - r = I420Copy(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I422: case FOURCC_YV16: { - int halfwidth = (width + 1) / 2; - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV16) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; } - r = I420ToI422(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I444: case FOURCC_YV24: { - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV24) { - dst_v = dst_sample + width * height; - dst_u = dst_v + width * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + width * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; } - r = I420ToI444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, width, - dst_v, width, - width, height); + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); break; } - case FOURCC_I411: { - int quarterwidth = (width + 3) / 4; - uint8* dst_u = dst_sample + width * height; - uint8* dst_v = dst_u + quarterwidth * height; - r = I420ToI411(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, quarterwidth, - dst_v, quarterwidth, - width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. default: return -1; // unknown fourcc - return failure code. diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc index 2a8682b7eb..c8d91252e9 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_from_argb.cc @@ -22,16 +22,21 @@ extern "C" { // ARGB little endian (bgra in memory) to I444 LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV444Row_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u == width && - dst_stride_v == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } } #endif #if defined(HAS_ARGBTOUV444ROW_NEON) @@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -89,6 +100,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -103,19 +122,23 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, // ARGB little endian (bgra in memory) to I422 LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -125,10 +148,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -170,6 +191,23 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); @@ -181,95 +219,25 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, return 0; } -// ARGB little endian (bgra in memory) to I411 LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; - void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV411Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 4 == width && - dst_stride_v * 4 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUV411ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; - if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row = ARGBToUV411Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToUV411Row(src_argb, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -314,6 +282,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -337,11 +321,19 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -364,21 +356,24 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, // Same as NV12 but U and V swapped. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -423,6 +418,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -446,24 +457,32 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; + dst_vu += dst_stride_vu; } if (height & 1) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); } free_aligned_buffer_64(row_u); @@ -473,19 +492,23 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, // Convert ARGB to YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; - if (!src_argb || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -495,8 +518,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yuy2 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_yuy2 = 0; @@ -537,6 +559,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -545,6 +583,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -553,12 +599,20 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -575,19 +629,23 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, // Convert ARGB to UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; - if (!src_argb || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -597,8 +655,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_uyvy == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_uyvy = 0; @@ -639,6 +696,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -647,6 +720,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -655,12 +736,20 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -677,11 +766,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; @@ -692,8 +784,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = 0; @@ -722,6 +813,14 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -732,28 +831,31 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return ARGBShuffle(src_argb, src_stride_argb, - dst_rgba, dst_stride_rgba, - (const uint8*)(&kShuffleMaskARGBToRGBA), - width, height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRGB24Row_C; if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -764,8 +866,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb24 == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_rgb24 = 0; @@ -778,6 +879,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; @@ -786,6 +903,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -797,11 +922,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { int y; - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRAWRow_C; if (!src_argb || !dst_raw || width <= 0 || height == 0) { return -1; @@ -812,8 +940,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_raw == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_raw = 0; @@ -826,6 +953,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON; @@ -834,6 +969,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -844,21 +987,23 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -894,9 +1039,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -906,12 +1061,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToRGB565Row_C; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -921,8 +1079,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb565 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_rgb565 = 0; @@ -951,6 +1108,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -962,12 +1127,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB1555Row_C; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; } @@ -977,8 +1145,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb1555 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb1555 = 0; @@ -1007,6 +1174,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1018,12 +1193,15 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB4444Row_C; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; } @@ -1033,8 +1211,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb4444 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb4444 = 0; @@ -1063,6 +1240,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1072,21 +1257,123 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1129,6 +1416,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1148,19 +1451,23 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J422. (JPeg full range I422). LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1170,10 +1477,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yj == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; @@ -1212,6 +1517,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1226,11 +1547,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J400. LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height) { +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; @@ -1241,8 +1565,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yj == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = 0; @@ -1271,6 +1594,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc index 90f550a26a..ae3cc18cd2 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_jpeg.cc @@ -22,28 +22,24 @@ extern "C" { #ifdef HAVE_JPEG struct I420Buffers { - uint8* y; + uint8_t* y; int y_stride; - uint8* u; + uint8_t* u; int u_stride; - uint8* v; + uint8_t* v; int v_stride; int w; int h; }; static void JpegCopyI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -51,17 +47,13 @@ static void JpegCopyI420(void* opaque, } static void JpegI422ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -69,35 +61,13 @@ static void JpegI422ToI420(void* opaque, } static void JpegI444ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI411ToI420(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I411ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -105,15 +75,12 @@ static void JpegI411ToI420(void* opaque, } static void JpegI400ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -122,8 +89,10 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height) { +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -135,15 +104,21 @@ int MJPGSize(const uint8* sample, size_t sample_size, } // MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToI420(const uint8* sample, +int MJPGToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -152,17 +127,17 @@ int MJPGToI420(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -170,8 +145,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -181,8 +157,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -192,28 +169,19 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -224,88 +192,67 @@ int MJPGToI420(const uint8* sample, #ifdef HAVE_JPEG struct ARGBBuffers { - uint8* argb; + uint8_t* argb; int argb_stride; int w; int h; }; static void JpegI420ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { + const uint8_t* const* data, + const int* strides, + int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI422ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI444ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI411ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I411ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI400ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], - dest->argb, dest->argb_stride, - dest->w, rows); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } // MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToARGB(const uint8* sample, +int MJPGToARGB(const uint8_t* sample, size_t sample_size, - uint8* argb, int argb_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -314,17 +261,16 @@ int MJPGToARGB(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -332,8 +278,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -343,8 +290,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -354,28 +302,19 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc index aecdc80fde..67484522c0 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_argb.cc @@ -28,36 +28,50 @@ extern "C" { // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// H420ToARGB +// H422ToARGB +// I010ToARGB +// J400ToARGB +// J422ToARGB +// J444ToARGB + LIBYUV_API -int ConvertToARGB(const uint8* sample, size_t sample_size, - uint8* crop_argb, int argb_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; int abs_src_height = (src_height < 0) ? -src_height : src_height; int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination crop_argb is same as source sample, + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, // also enable temporary buffer. - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || - crop_argb == sample; - uint8* dest_argb = crop_argb; - int dest_argb_stride = argb_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (crop_argb == NULL || sample == NULL || - src_width <= 0 || crop_width <= 0 || + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } @@ -67,187 +81,174 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8*)malloc(argb_size); + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. } - crop_argb = rotate_buffer; - argb_stride = crop_width * 4; + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. - r = NV21ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; + // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J420: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -255,32 +256,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToARGB(src_y, src_width, - src_u, src_width, - src_v, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToARGB(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, - crop_argb, argb_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -289,11 +272,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { if (!r) { - r = ARGBRotate(crop_argb, argb_stride, - dest_argb, dest_argb_stride, + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, crop_width, abs_crop_height, rotation); } free(rotate_buffer); + } else if (rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height, rotation); } return r; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc index e5f307c446..df08309f9b 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/convert_to_i420.cc @@ -25,251 +25,216 @@ extern "C" { // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. LIBYUV_API -int ConvertToI420(const uint8* sample, +int ConvertToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; const int abs_src_height = (src_height < 0) ? -src_height : src_height; // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && - format != FOURCC_NV12 && format != FOURCC_NV21 && - format != FOURCC_YV12) || y == sample; - uint8* tmp_y = y; - uint8* tmp_u = u; - uint8* tmp_v = v; - int tmp_y_stride = y_stride; - int tmp_u_stride = u_stride; - int tmp_v_stride = v_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; const int inv_crop_height = (src_height < 0) ? -abs_crop_height : abs_crop_height; - if (!y || !u || !v || !sample || - src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination y is same as source sample, + // For in-place conversion, if destination dst_y is same as source sample, // also enable temporary buffer. if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. } - y = rotate_buffer; - u = y + y_size; - v = u + uv_size; - y_stride = crop_width; - u_stride = v_stride = ((crop_width + 1) / 2); + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; + // TODO(fbarchard): Add AR30 and AB30 case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with u and v parameters swapped. - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - v, v_stride, - u, u_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420Rotate(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToI420(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -277,38 +242,16 @@ int ConvertToI420(const uint8* sample, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToI420(src_y, src_width, - src_u, src_width, - src_v, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToI420(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, - y, y_stride, - u, u_stride, - v, v_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -317,13 +260,10 @@ int ConvertToI420(const uint8* sample, if (need_buf) { if (!r) { - r = I420Rotate(y, y_stride, - u, u_stride, - v, v_stride, - tmp_y, tmp_y_stride, - tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, - crop_width, abs_crop_height, rotation); + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); } free(rotate_buffer); } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc index 84927ebc3e..31e24b6739 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/cpu_id.cc @@ -13,22 +13,16 @@ #if defined(_MSC_VER) #include // For __cpuidex() #endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include // For _xgetbv() #endif -#if !defined(__native_client__) -#include // For getenv() -#endif - // For ArmCpuCaps() but unittested on all platforms #include #include -#include "libyuv/basic_types.h" // For CPU_X86 - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -43,16 +37,20 @@ extern "C" { #define SAFEBUFFERS #endif +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API -void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +void CpuId(int info_eax, int info_ecx, int* cpu_info) { #if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex((int*)(cpu_info), info_eax, info_ecx); + __cpuidex(cpu_info, info_eax, info_ecx); #elif defined(_M_IX86) __asm { mov eax, info_eax @@ -66,26 +64,26 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // Visual C but not x86 if (info_ecx == 0) { - __cpuid((int*)(cpu_info), info_eax); + __cpuid(cpu_info, info_eax); } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. #else // defined(_MSC_VER) - uint32 info_ebx, info_edx; - asm volatile ( -#if defined( __i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D" (info_ebx), + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), #else - "cpuid \n" - : "=b" (info_ebx), + "cpuid \n" + : "=b"(info_ebx), #endif // defined( __i386__) && defined(__PIC__) - "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); cpu_info[0] = info_eax; cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; @@ -94,7 +92,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif @@ -111,20 +111,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { #if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -#define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { - uint32 xcr0 = 0u; + int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) return xcr0; } +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. #if defined(_M_IX86) && (_MSC_VER < 1900) @@ -133,8 +135,7 @@ int GetXCR0() { // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS -int ArmCpuCaps(const char* cpuinfo_name) { +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { @@ -151,7 +152,7 @@ int ArmCpuCaps(const char* cpuinfo_name) { } // aarch64 uses asimd for Neon. p = strstr(cpuinfo_line, " asimd"); - if (p && (p[6] == ' ' || p[6] == '\n')) { + if (p) { fclose(f); return kCpuHasNEON; } @@ -161,103 +162,78 @@ int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// CPU detect function for SIMD instruction sets. -LIBYUV_API -int cpu_info_ = 0; // cpu_info is not initialized yet. - -// Test environment variable for disabling CPU features. Any non-zero value -// to disable. Zero ignored to make it easy to set the variable on/off. -#if !defined(__native_client__) && !defined(_M_ARM) - -static LIBYUV_BOOL TestEnv(const char* name) { - const char* var = getenv(name); - if (var) { - if (var[0] != '0') { - return LIBYUV_TRUE; +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } } } - return LIBYUV_FALSE; + fclose(f); + return 0; } -#else // nacl does not support getenv(). -static LIBYUV_BOOL TestEnv(const char*) { - return LIBYUV_FALSE; -} -#endif -LIBYUV_API SAFEBUFFERS -int InitCpuFlags(void) { - // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized. +static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = { 0, 0, 0, 0 }; - uint32 cpu_info1[4] = { 0, 0, 0, 0 }; - uint32 cpu_info7[4] = { 0, 0, 0, 0 }; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } - cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - kCpuHasX86; + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); -#ifdef HAS_XGETBV - // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv + // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } #endif - - // Environment variable overrides for testing. - if (TestEnv("LIBYUV_DISABLE_X86")) { - cpu_info &= ~kCpuHasX86; - } - if (TestEnv("LIBYUV_DISABLE_SSE2")) { - cpu_info &= ~kCpuHasSSE2; - } - if (TestEnv("LIBYUV_DISABLE_SSSE3")) { - cpu_info &= ~kCpuHasSSSE3; - } - if (TestEnv("LIBYUV_DISABLE_SSE41")) { - cpu_info &= ~kCpuHasSSE41; - } - if (TestEnv("LIBYUV_DISABLE_SSE42")) { - cpu_info &= ~kCpuHasSSE42; - } - if (TestEnv("LIBYUV_DISABLE_AVX")) { - cpu_info &= ~kCpuHasAVX; - } - if (TestEnv("LIBYUV_DISABLE_AVX2")) { - cpu_info &= ~kCpuHasAVX2; - } - if (TestEnv("LIBYUV_DISABLE_ERMS")) { - cpu_info &= ~kCpuHasERMS; - } - if (TestEnv("LIBYUV_DISABLE_FMA3")) { - cpu_info &= ~kCpuHasFMA3; - } - if (TestEnv("LIBYUV_DISABLE_AVX3")) { - cpu_info &= ~kCpuHasAVX3; - } -#endif #if defined(__mips__) && defined(__linux__) -#if defined(__mips_dspr2) - cpu_info |= kCpuHasDSPR2; +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); #endif cpu_info |= kCpuHasMIPS; - if (getenv("LIBYUV_DISABLE_DSPR2")) { - cpu_info &= ~kCpuHasDSPR2; - } #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ @@ -276,22 +252,22 @@ int InitCpuFlags(void) { cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif cpu_info |= kCpuHasARM; - if (TestEnv("LIBYUV_DISABLE_NEON")) { - cpu_info &= ~kCpuHasNEON; - } #endif // __arm__ - if (TestEnv("LIBYUV_DISABLE_ASM")) { - cpu_info = 0; - } - cpu_info |= kCpuInitialized; - cpu_info_ = cpu_info; + cpu_info |= kCpuInitialized; return cpu_info; } // Note that use of this function is not thread safe. LIBYUV_API -void MaskCpuFlags(int enable_flags) { - cpu_info_ = InitCpuFlags() & enable_flags; +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); } #ifdef __cplusplus diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc index 22025ad04a..eaf2530130 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_decoder.cc @@ -21,7 +21,7 @@ #if defined(_MSC_VER) // disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable:4324) +#pragma warning(disable : 4324) #endif #endif @@ -102,7 +102,7 @@ MJpegDecoder::~MJpegDecoder() { DestroyOutputBuffers(); } -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { if (!ValidateJpeg(src, src_len)) { return LIBYUV_FALSE; } @@ -129,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (scanlines_[i]) { delete scanlines_[i]; } - scanlines_[i] = new uint8* [scanlines_size]; + scanlines_[i] = new uint8_t*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } @@ -145,7 +145,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (databuf_[i]) { delete databuf_[i]; } - databuf_[i] = new uint8[databuf_size]; + databuf_[i] = new uint8_t[databuf_size]; databuf_strides_[i] = databuf_stride; } @@ -195,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) { } int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / - GetHorizSampFactor(component); + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); } int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / - GetVertSampFactor(component); + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); } int MJpegDecoder::GetImageScanlinesPerImcuRow() { @@ -245,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() { } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( - uint8** planes, int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -289,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = - DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - - rows_to_skip; + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } lines_left -= (GetImageScanlinesPerImcuRow() - skip); @@ -305,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( // Read full MCUs but cropped horizontally for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } @@ -328,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } return FinishDecode(); } -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -395,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, } // Read full MCUs until we get to the crop point. for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; @@ -435,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT } void term_source(j_decompress_ptr cinfo) { - // Nothing to do. + (void)cinfo; // Nothing to do. } #ifdef HAVE_SETJMP void ErrorHandler(j_common_ptr cinfo) { - // This is called when a jpeglib command experiences an error. Unfortunately - // jpeglib's error handling model is not very flexible, because it expects the - // error handler to not return--i.e., it wants the program to terminate. To - // recover from errors we use setjmp() as shown in their example. setjmp() is - // C's implementation for the "call with current continuation" functionality - // seen in some functional programming languages. - // A formatted message can be output, but is unsafe for release. +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. #ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); - // ERROR: Error in jpeglib: buf +// ERROR: Error in jpeglib: buf #endif SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); @@ -459,8 +455,9 @@ void ErrorHandler(j_common_ptr cinfo) { longjmp(mgr->setjmp_buffer, 1); } +// Suppress fprintf warnings. void OutputHandler(j_common_ptr cinfo) { - // Suppress fprintf warnings. + (void)cinfo; } #endif // HAVE_SETJMP @@ -472,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { // it. DestroyOutputBuffers(); - scanlines_ = new uint8** [num_outbufs]; + scanlines_ = new uint8_t**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8* [num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { @@ -490,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { void MJpegDecoder::DestroyOutputBuffers() { for (int i = 0; i < num_outbufs_; ++i) { - delete [] scanlines_[i]; - delete [] databuf_[i]; + delete[] scanlines_[i]; + delete[] databuf_[i]; } - delete [] scanlines_; - delete [] databuf_; - delete [] scanlines_sizes_; - delete [] databuf_strides_; + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; scanlines_ = NULL; databuf_ = NULL; scanlines_sizes_ = NULL; @@ -530,9 +527,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() { return LIBYUV_TRUE; } -void MJpegDecoder::SetScanlinePointers(uint8** data) { +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { for (int i = 0; i < num_outbufs_; ++i) { - uint8* data_i = data[i]; + uint8_t* data_i = data[i]; for (int j = 0; j < scanlines_sizes_[i]; ++j) { scanlines_[i][j] = data_i; data_i += GetComponentStride(i); @@ -542,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) { inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, - scanlines_, - GetImageScanlinesPerImcuRow()); + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); } // The helper function which recognizes the jpeg sub-sampling type. JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components) { + int* subsample_x, + int* subsample_y, + int number_of_components) { if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 2 && - subsample_x[2] == 2 && subsample_y[2] == 2) { + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 1 && - subsample_x[2] == 2 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 1 && subsample_y[1] == 1 && - subsample_x[2] == 1 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. @@ -574,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( } // namespace libyuv #endif // HAVE_JPEG - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc index 9c48832045..80c2cc0cb9 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/mjpeg_validate.cc @@ -18,13 +18,13 @@ extern "C" { #endif // Helper function to scan for EOI marker (0xff 0xd9). -static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { +static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) { if (sample_size >= 2) { - const uint8* end = sample + sample_size - 1; - const uint8* it = sample; + const uint8_t* end = sample + sample_size - 1; + const uint8_t* it = sample; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. - it = static_cast(memchr(it, 0xff, end - it)); + it = (const uint8_t*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } @@ -39,7 +39,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { } // Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) { // Maximum size that ValidateJpeg will consider valid. const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; @@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { } // extern "C" } // namespace libyuv #endif - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc index a764f8da47..5eae3f763a 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/planar_functions.cc @@ -26,11 +26,14 @@ extern "C" { // Copy a plane of data LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -38,8 +41,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, dst_stride_y = -dst_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -48,6 +50,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } + #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -68,11 +71,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -83,15 +81,18 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } // TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -111,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, CopyRow = CopyRow_16_NEON; } #endif -#if defined(HAS_COPYROW_16_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_16_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -125,19 +121,124 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, } } +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -161,16 +262,21 @@ int I422Copy(const uint8* src_y, int src_stride_y, // Copy I444. LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -194,9 +300,12 @@ int I444Copy(const uint8* src_y, int src_stride_y, // Copy I400. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -212,11 +321,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y, // Convert I420 to I400. LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -234,12 +352,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y, // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; // Negative height means invert the image. if (height < 0) { @@ -250,8 +372,7 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_uv == width * 2 && - dst_stride_u == width && + if (src_stride_uv == width * 2 && dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; @@ -281,13 +402,11 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } } #endif -#if defined(HAS_SPLITUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_DSPR2; +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; } } #endif @@ -302,13 +421,17 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; - void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; // Coalesce rows. // Negative height means invert the image. if (height < 0) { @@ -317,8 +440,7 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, dst_stride_uv = -dst_stride_uv; } // Coalesce rows. - if (src_stride_u == width && - src_stride_v == width && + if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; @@ -348,6 +470,14 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -358,12 +488,131 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } -// Mirror a plane of data. -void MirrorPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. + if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + +// Mirror a plane of data. +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -394,12 +643,12 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif @@ -413,17 +662,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, // Convert YUY2 to I422. LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) = - YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -431,10 +687,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -462,15 +717,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -485,17 +748,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I422. LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUV422Row)(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) = - UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -503,10 +773,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -534,15 +803,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; - if (width >= 16) { - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - } + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUV422Row = UYVYToUV422Row_NEON; } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -555,13 +832,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, return 0; } +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + // Mirror I400 with optional flipping LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -577,17 +923,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y, // Mirror I420 with optional flipping LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -612,11 +965,14 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = ARGBMirrorRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -651,6 +1007,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -666,8 +1030,8 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, // the same blend function for all pixels if possible. LIBYUV_API ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlendRow_C; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBBlendRow = ARGBBlendRow_SSSE3; @@ -678,19 +1042,28 @@ ARGBBlendRow GetARGBBlend() { if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } #endif return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = GetARGBBlend(); + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -701,8 +1074,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -720,14 +1092,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, // Alpha Blend plane and store to destination. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; } @@ -739,10 +1117,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, } // Coalesce rows for Y plane. - if (src_stride_y0 == width && - src_stride_y1 == width && - alpha_stride == width && - dst_stride_y == width) { + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; @@ -750,7 +1126,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } @@ -758,7 +1134,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; + BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } @@ -778,24 +1154,36 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; // Half width/height for UV. int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -809,11 +1197,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, } // Blend Y plane. - BlendPlane(src_y0, src_stride_y0, - src_y1, src_stride_y1, - alpha, alpha_stride, - dst_y, dst_stride_y, - width, height); + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -893,13 +1278,17 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, // Multiply 2 ARGB images and store to destination. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBMultiplyRow_C; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -910,8 +1299,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -941,6 +1329,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -954,12 +1350,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, // Add 2 ARGB images and store to destination. LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBAddRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -971,8 +1371,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1007,6 +1406,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1020,13 +1427,17 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, // Subtract 2 ARGB images and store to destination. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBSubtractRow_C; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1037,8 +1448,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1068,6 +1478,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -1079,21 +1497,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, return 0; } // Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1126,13 +1546,12 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -1148,48 +1567,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I422 to BGRA. LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert NV12 to RGB565. LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*NV12ToRGB565Row)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || - width <= 0 || height == 0) { + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1222,6 +1648,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); @@ -1236,14 +1670,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert RAW to RGB24. LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || - width <= 0 || height == 0) { + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1253,8 +1689,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. - if (src_stride_raw == width * 3 && - dst_stride_rgb24 == width * 3) { + if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_raw = dst_stride_rgb24 = 0; @@ -1275,6 +1710,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1285,11 +1728,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value) { +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { int y; - void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1322,6 +1767,11 @@ void SetPlane(uint8* dst_y, int dst_stride_y, SetRow = SetRow_ERMS; } #endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1332,22 +1782,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y, // Draw a rectangle into I420 LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v) { +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - uint8* start_y = dst_y + y * dst_stride_y + x; - uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || - width <= 0 || height == 0 || - x < 0 || y < 0 || - value_y < 0 || value_y > 255 || - value_u < 0 || value_u > 255 || + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { return -1; } @@ -1360,15 +1814,17 @@ int I420Rect(uint8* dst_y, int dst_stride_y, // Draw a rectangle into ARGB LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height, - uint32 value) { +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value) { int y; - void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C; - if (!dst_argb || - width <= 0 || height == 0 || - dst_x < 0 || dst_y < 0) { + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } if (height < 0) { @@ -1397,6 +1853,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ARGBSetRow = ARGBSetRow_X86; } #endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1420,11 +1884,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, // f is foreground pixel premultiplied by alpha LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1435,8 +1902,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1465,6 +1931,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -1476,11 +1950,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1491,8 +1968,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1513,7 +1989,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, } } #endif -// TODO(fbarchard): Neon version. + // TODO(fbarchard): Neon version. for (y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); @@ -1525,12 +2001,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, // Convert ARGB to Grayed ARGB. LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1540,8 +2019,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1556,6 +2034,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -1567,13 +2050,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height) { +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1593,6 +2079,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); dst += dst_stride_argb; @@ -1602,11 +2094,15 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, // Make a rectangle of ARGB Sepia tone. LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1626,6 +2122,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); dst += dst_stride_argb; @@ -1636,13 +2138,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height) { +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { int y; - void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } @@ -1652,8 +2158,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1667,6 +2172,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); @@ -1679,13 +2189,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int dst_x, int dst_y, int width, int height) { - SIMD_ALIGNED(int8 matrix_argb[16]); - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } @@ -1705,23 +2219,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 - return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, - dst, dst_stride_argb, - &matrix_argb[0], width, height); + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1745,15 +2262,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1784,13 +2305,19 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb, // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int dst_x, int dst_y, int width, int height) { +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || interval_size < 1 || interval_size > 255) { return -1; @@ -1810,6 +2337,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); @@ -1821,13 +2353,17 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height) { +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { int y; - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - int32* previous_cumsum = dst_cumsum; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } @@ -1851,18 +2387,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius) { +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { int y; - void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; - int32* cumsum_bot_row; - int32* max_cumsum_bot_row; - int32* cumsum_top_row; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1889,9 +2432,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, #endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, - dst_cumsum, dst_stride32_cumsum, - width, radius); + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); src_argb = src_argb + radius * src_stride_argb; cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; @@ -1917,7 +2459,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Increment cumsum_bot_row pointer with circular buffer wrap around and // then fill in a row of CumulativeSum. if ((y + radius) < height) { - const int32* prev_cumsum_bot_row = cumsum_bot_row; + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; cumsum_bot_row += dst_stride32_cumsum; if (cumsum_bot_row >= max_cumsum_bot_row) { cumsum_bot_row = dst_cumsum; @@ -1929,24 +2471,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Left clipped. for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1955,12 +2497,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Multiply ARGB image by a specified ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value) { +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { int y; - void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, - int width, uint32 value) = ARGBShadeRow_C; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } @@ -1970,8 +2516,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1986,6 +2531,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ARGBShadeRow = ARGBShadeRow_NEON; } #endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -1997,12 +2547,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation) { +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { int y; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -2015,9 +2570,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0, dst_stride = -dst_stride; } // Coalesce rows. - if (src_stride0 == width && - src_stride1 == width && - dst_stride == width) { + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { width *= height; height = 1; src_stride0 = src_stride1 = dst_stride = 0; @@ -2046,13 +2599,12 @@ int InterpolatePlane(const uint8* src0, int src_stride0, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) && - IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) && - IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif @@ -2067,61 +2619,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0, // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, - src_argb1, src_stride_argb1, - dst_argb, dst_stride_argb, +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, width * 4, height, interpolation); } // Interpolate 2 YUV images by specified amount (0 to 255). LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation) { +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || - !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } - InterpolatePlane(src0_y, src0_stride_y, - src1_y, src1_stride_y, - dst_y, dst_stride_y, - width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, - src1_u, src1_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, - src1_v, src1_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight, interpolation); + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); return 0; } // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height) { +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { int y; - void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, - const uint8* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || - width <= 0 || height == 0) { + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2131,20 +2693,11 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_stride_bgra = -src_stride_bgra; } // Coalesce rows. - if (src_stride_bgra == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_bgra = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_SSE2; - } - } -#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -2169,6 +2722,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2179,28 +2740,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, - void (*SobelRow)(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst, int width)) { +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } @@ -2228,6 +2793,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2239,6 +2812,11 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, SobelYRow = SobelYRow_NEON; } #endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -2248,19 +2826,24 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON)) { SobelXRow = SobelXRow_NEON; } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } #endif { // 3 rows with edges before/after. const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8* row_sobelx = rows; - uint8* row_sobely = rows + kRowSize; - uint8* row_y = rows + kRowSize * 2; + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kRowSize; - uint8* row_y2 = row_y1 + kRowSize; + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -2284,7 +2867,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Cycle thru circular queue of 3 row_y buffers. { - uint8* row_yt = row_y0; + uint8_t* row_yt = row_y0; row_y0 = row_y1; row_y1 = row_y2; row_y2 = row_yt; @@ -2299,11 +2882,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_Any_SSE2; @@ -2319,6 +2905,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, SobelRow = SobelRow_NEON; } } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); @@ -2326,11 +2920,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { - void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_, int width) = SobelToPlaneRow_C; +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; @@ -2347,18 +2944,29 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, } } #endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, - width, height, SobelToPlaneRow); +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); } // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_Any_SSE2; @@ -2374,6 +2982,14 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, SobelXYRow = SobelXYRow_NEON; } } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); @@ -2381,26 +2997,27 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height) { + int width, + int height) { int y; - void (*ARGBPolynomialRow)(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) = ARGBPolynomialRow_C; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2425,28 +3042,132 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + // Apply a lumacolortable to each ARGB pixel. LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma, - int width, int height) { +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { int y; - void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, - int width, const uint8* luma, const uint32 lumacoeff) = - ARGBLumaColorTableRow_C; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2467,12 +3188,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // Copy Alpha from one ARGB image to another. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBCopyAlphaRow_C; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2483,8 +3207,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2516,55 +3239,73 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, // Extract just the alpha channel from ARGB. LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride, - uint8* dst_a, int dst_stride, - int width, int height) { +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { if (!src_argb || !dst_a || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb += (height - 1) * src_stride; - src_stride = -src_stride; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride == width * 4 && dst_stride == width) { + if (src_stride_argb == width * 4 && dst_stride_a == width) { width *= height; height = 1; - src_stride = dst_stride = 0; + src_stride_argb = dst_stride_a = 0; } - void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) = - ARGBExtractAlphaRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif #if defined(HAS_ARGBEXTRACTALPHAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON : ARGBExtractAlphaRow_Any_NEON; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride; - dst_a += dst_stride; + src_argb += src_stride_argb; + dst_a += dst_stride_a; } return 0; } // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2575,8 +3316,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -2610,20 +3350,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, // directly. A SplitUVRow_Odd function could copy the remaining chroma. LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2656,6 +3398,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2680,6 +3430,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -2708,20 +3466,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2754,6 +3514,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2778,6 +3546,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc index 01ea5c4074..f2bed85b75 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate.cc @@ -10,8 +10,8 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate_row.h" #include "libyuv/row.h" @@ -22,12 +22,20 @@ extern "C" { #endif LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i = height; - void (*TransposeWx8)(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; @@ -49,24 +57,32 @@ void TransposePlane(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_Fast_DSPR2; - } else { - TransposeWx8 = TransposeWx8_DSPR2; +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; } } #endif +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else // Work across the source in 8x8 tiles while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); @@ -74,9 +90,12 @@ void TransposePlane(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 90 is a transpose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. @@ -86,9 +105,12 @@ void RotatePlane90(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 270 is a transpose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -98,17 +120,20 @@ void RotatePlane270(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; @@ -133,12 +158,12 @@ void RotatePlane180(const uint8* src, int src_stride, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif #if defined(HAS_COPYROW_SSE2) @@ -161,11 +186,6 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { @@ -181,15 +201,24 @@ void RotatePlane180(const uint8* src, int src_stride, } LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i = height; - void (*TransposeUVWx8)(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +#if defined(HAS_TRANSPOSEUVWX16_MSA) + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; +#else + void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; +#endif #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -203,72 +232,90 @@ void TransposeUV(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEUVWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeUVWx8 = TransposeUVWx8_DSPR2; +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } } #endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + // Work through the source in 8x8 tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride; // Go down 16 rows. + dst_a += 16; // Move over 8 columns. + dst_b += 16; // Move over 8 columns. + i -= 16; + } +#else // Work through the source in 8x8 tiles. while (i >= 8) { - TransposeUVWx8(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { - TransposeUVWxH_C(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, i); } } LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { src += src_stride * (height - 1); src_stride = -src_stride; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } // Rotate 180 is a horizontal and vertical flip. LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; - void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = - MirrorUVRow_C; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_NEON; @@ -279,10 +326,9 @@ void RotateUV180(const uint8* src, int src_stride, MirrorUVRow = MirrorUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - MirrorUVRow = MirrorUVRow_DSPR2; +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; } #endif @@ -298,9 +344,12 @@ void RotateUV180(const uint8* src, int src_stride, } LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height, +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, enum RotationMode mode) { if (!src || width <= 0 || height == 0 || !dst) { return -1; @@ -316,24 +365,16 @@ int RotatePlane(const uint8* src, int src_stride, switch (mode) { case kRotate0: // copy frame - CopyPlane(src, src_stride, - dst, dst_stride, - width, height); + CopyPlane(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate90: - RotatePlane90(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane90(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate270: - RotatePlane270(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane270(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate180: - RotatePlane180(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane180(src, src_stride, dst, dst_stride, width, height); return 0; default: break; @@ -342,18 +383,25 @@ int RotatePlane(const uint8* src, int src_stride, } LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { return -1; } @@ -372,45 +420,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return I420Copy(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane90(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane90(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane270(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane270(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane180(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane180(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; default: break; @@ -419,17 +451,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y, } LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { return -1; } @@ -446,38 +484,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return NV12ToI420(src_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV90(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV270(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV180(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; default: break; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc index 31a74c3155..c2752e6222 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_any.cc @@ -18,16 +18,16 @@ namespace libyuv { extern "C" { #endif -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst, int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\ - } +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) @@ -38,25 +38,23 @@ TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif -#ifdef HAS_TRANSPOSEWX8_DSPR2 -TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst_a, int dst_stride_a, \ - uint8* dst_b, int dst_stride_b, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \ - n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, \ - dst_a + n * dst_stride_a, dst_stride_a, \ - dst_b + n * dst_stride_b, dst_stride_b, r); \ - } + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) @@ -64,8 +62,8 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_DSPR2 -TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif #undef TUVANY @@ -73,8 +71,3 @@ TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc index 787c0ad1be..5a6e05376f 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_argb.cc @@ -10,90 +10,106 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// ARGBScale has a function to copy pixels to a row, striding each source -// pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif - -void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, - int src_stepx, uint8* dst_ptr, int dst_width); - -static void ARGBTranspose(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int i; - int src_pixel_step = src_stride >> 2; - void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, - int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } } #endif for (i = 0; i < width; ++i) { // column of source to row of dest. - ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height); - dst += dst_stride; - src += 4; + ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); + dst_argb += dst_stride_argb; + src_argb += 4; } } -void ARGBRotate90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + src_argb += src_stride_argb * (height - 1); + src_stride_argb = -src_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + dst_argb += dst_stride_argb * (width - 1); + dst_stride_argb = -dst_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width * 4); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); + uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBMirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + CopyRow_C; #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; @@ -118,6 +134,14 @@ void ARGBRotate180(const uint8* src, int src_stride, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -138,28 +162,27 @@ void ARGBRotate180(const uint8* src, int src_stride, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); } LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, int width, int height, +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; @@ -175,23 +198,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb, switch (mode) { case kRotate0: // copy frame - return ARGBCopy(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; default: break; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc index b33a9a0c6e..ff212adebc 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_common.cc @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { int i; for (i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; @@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride, } } -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { int i; for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; @@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride, } } -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i; for (i = 0; i < width; ++i) { int j; @@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride, } } -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; for (i = 0; i < width * 2; i += 2) { int j; diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc index cbe870caa7..04e19e29ee 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_gcc.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -22,342 +22,348 @@ extern "C" { // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" - ); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); } #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" - ); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc deleted file mode 100644 index 1e8ce25197..0000000000 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_mips.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/rotate_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "sw $s0, 0(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "sw $s1, 4(%[dst]) \n" - "bnez %[width], 1b \n" - " addu %[dst], %[dst], %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "swr $s0, 0(%[dst]) \n" - "swl $s0, 3(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "swr $s1, 4(%[dst]) \n" - "swl $s1, 7(%[dst]) \n" - "bnez %[width], 11b \n" - "addu %[dst], %[dst], %[dst_stride] \n" - "2: \n" - ".set pop \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1" - ); -} - -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set noat \n" - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - - "srl $AT, %[width], 0x2 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "sw $s4, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $s6, 0($s0) \n" - "sw $t8, 4($s0) \n" - "sw $s5, 0($s1) \n" - "sw $t1, 4($s1) \n" - "sw $s7, 0($s2) \n" - "sw $t9, 4($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 1b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "swr $s4, 0(%[dst]) \n" - "swl $s4, 3(%[dst]) \n" - "swr $t0, 4(%[dst]) \n" - "swl $t0, 7(%[dst]) \n" - "swr $s6, 0($s0) \n" - "swl $s6, 3($s0) \n" - "swr $t8, 4($s0) \n" - "swl $t8, 7($s0) \n" - "swr $s5, 0($s1) \n" - "swl $s5, 3($s1) \n" - "swr $t1, 4($s1) \n" - "swl $t1, 7($s1) \n" - "swr $s7, 0($s2) \n" - "swl $s7, 3($s2) \n" - "swr $t9, 4($s2) \n" - "swl $t9, 7($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 11b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "2: \n" - ".set pop \n" - ".set at \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" - ); -} - -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "subu $t7, $t9, %[src_stride] \n" - "srl $t1, %[width], 1 \n" - -// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b - "andi $t0, %[dst_a], 0x3 \n" - "andi $t8, %[dst_b], 0x3 \n" - "or $t0, $t0, $t8 \n" - "andi $t8, %[dst_stride_a], 0x3 \n" - "andi $s5, %[dst_stride_b], 0x3 \n" - "or $t8, $t8, $s5 \n" - "or $t0, $t0, $t8 \n" - "bnez $t0, 11f \n" - " nop \n" -// dst + dst_stride word aligned (both, a & b dst addresses) - "1: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "sw $s3, 0($s5) \n" - "sw $s4, 0($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "sw $s3, 0(%[dst_a]) \n" - "sw $s4, 0(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - "sw $s3, 4($s5) \n" - "sw $s4, 4($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "sw $s3, 4(%[dst_a]) \n" - "sw $s4, 4(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 1b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - "b 2f \n" - " nop \n" - -// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned - "11: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "swr $s3, 0($s5) \n" - "swl $s3, 3($s5) \n" - "swr $s4, 0($s6) \n" - "swl $s4, 3($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "swr $s3, 0(%[dst_a]) \n" - "swl $s3, 3(%[dst_a]) \n" - "swr $s4, 0(%[dst_b]) \n" - "swl $s4, 3(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - - "swr $s3, 4($s5) \n" - "swl $s3, 7($s5) \n" - "swr $s4, 4($s6) \n" - "swl $s4, 7($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "swr $s3, 4(%[dst_a]) \n" - "swl $s3, 7(%[dst_a]) \n" - "swr $s4, 4(%[dst_b]) \n" - "swl $s4, 7(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 11b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - - "2: \n" - ".set pop \n" - : [src] "+r" (src), - [dst_a] "+r" (dst_a), - [dst_b] "+r" (dst_b), - [width] "+r" (width), - [src_stride] "+r" (src_stride) - : [dst_stride_a] "r" (dst_stride_a), - [dst_stride_b] "r" (dst_stride_b) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc new file mode 100644 index 0000000000..99bdca65b3 --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc index 1c22b472bc..fdc0dd476c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld1.8 {d0}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d1}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d2}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d3}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d4}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d5}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d6}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" @@ -77,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 @@ -99,180 +85,138 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "subs %5, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" - "cmp %5, #4 \n" - "blt 2f \n" + "cmp %5, #4 \n" + "blt 2f \n" - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[1]}, [%0] \n" + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(6) - "vld1.8 {q3}, [%6] \n" + "vld1.8 {q3}, [%6] \n" - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. - MEMACCESS(0) - "vst1.32 {d4[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d4[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[1]}, [%0] \n" + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d0[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d0[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[1]}, [%0] \n" + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.16 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[3]}, [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" - "vtrn.8 d0, d1 \n" + "vtrn.8 d0, d1 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0] \n" + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld1.8 {d0[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[7]}, [%1] \n" + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" + "vst1.64 {d0}, [%3] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); } -static uvec8 kVTbl4x4TransposeDi = - { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld2.8 {d0, d1}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d2, d3}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d4, d5}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d6, d7}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d16, d17}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d18, d19}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d20, d21}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d22, d23}, [%0] \n" "vtrn.8 q1, q0 \n" @@ -301,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d18}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d16}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d22}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d19}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d17}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d23}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 @@ -343,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "subs %7, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %7, #8 \n" + "beq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" - "cmp %7, #4 \n" - "blt 2f \n" + "cmp %7, #4 \n" + "blt 2f \n" - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.64 {d0}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d1}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d2}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d3}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d4}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d5}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d6}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d7}, [%0] \n" + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" - MEMACCESS(8) - "vld1.8 {q15}, [%8] \n" + "vld1.8 {q15}, [%8] \n" - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.32 {d16[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d16[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[1]}, [%0], %4 \n" + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d20[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d20[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[1]}, [%0] \n" + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" - "mov %0, %5 \n" + "mov %0, %5 \n" - MEMACCESS(0) - "vst1.32 {d18[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d18[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[1]}, [%0], %6 \n" + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" - "add %0, %5, #4 \n" - MEMACCESS(0) - "vst1.32 {d22[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d22[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[1]}, [%0] \n" + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %7, #2 \n" - "blt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[3], d3[3]}, [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d2}, [%0] \n" + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" - "mov %0, %5 \n" + "mov %0, %5 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0], %6 \n" - MEMACCESS(0) - "vst1.64 {d3}, [%0] \n" + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[7], d1[7]}, [%1] \n" + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - MEMACCESS(5) - "vst1.64 {d1}, [%5] \n" + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc index 1ab448f3ab..f469baacf6 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %3, %3, #8 \n" +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v7.8b}, [%0] \n" "trn2 v16.8b, v0.8b, v1.8b \n" @@ -84,456 +78,345 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %2 \n" - MEMACCESS(0) "st1 {v17.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v16.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v19.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v18.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v21.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v20.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v23.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %3, %3, #8 \n" // w -= 8 + "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %3, %3, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %3, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[3], [%0] \n" + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. - MEMACCESS(0) - "st1 {v3.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[3], [%0] \n" + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v0.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %3, %3, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %3, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v3.8b}, [%0], %6 \n" - MEMACCESS(0) - "st1 {v2.8b}, [%0] \n" + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %3, %3, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld1 {v0.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2] \n" + "st1 {v0.8b}, [%2] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width64) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } -static uint8 kVTbl4x4TransposeDi[32] = - { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %4, %4, #8 \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.16b}, [%0] \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v16.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[1], [%0] \n" + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v20.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v20.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[1], [%0] \n" + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %4, %4, #8 \n" // w -= 8 - "b.ge 1b \n" + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %4, %4, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w4, %w4, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %4, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.8b}, [%0] \n" + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" - MEMACCESS(8) - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v16.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[3], [%0], %6 \n" + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v18.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v17.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[3], [%0], %7 \n" + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" - "add %0, %3, #4 \n" - MEMACCESS(0) - "st1 {v19.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[3], [%0] \n" + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %4, %4, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %4, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - MEMACCESS(0) - "st1 {v4.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v6.d}[0], [%0] \n" + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - MEMACCESS(0) - "st1 {v5.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v7.d}[0], [%0] \n" + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %4, %4, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" - MEMACCESS(2) - "st1 {v0.d}[0], [%2] \n" - MEMACCESS(3) - "st1 {v1.d}[0], [%3] \n" + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width64) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v30", "v31" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride_a)), // %6 + "r"(static_cast(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc index 1300fc0feb..e887dd525c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/rotate_win.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -17,17 +17,19 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { __asm { push edi push esi push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width @@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, } } -__declspec(naked) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { __asm { push ebx push esi push edi push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride mov edx, [esp + 16 + 12] // dst_a mov esi, [esp + 16 + 16] // dst_stride_a mov ebx, [esp + 16 + 20] // dst_b @@ -133,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, mov ecx, [ecx + 16 + 28] // w align 4 - convertloop: // Read in the data from the source pointer. // First round of bit swap. + convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] @@ -162,13 +166,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea eax, [eax + 2 * edi] movdqu [esp], xmm5 // backup xmm5 neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. + movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 punpckhbw xmm5, xmm7 movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -183,12 +187,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, movdqa xmm6, xmm5 movdqu xmm5, [esp] // restore xmm5 movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. + movdqa xmm6, xmm5 // use xmm6 as temp register. punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 @@ -200,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm4 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. + movdqa xmm0, xmm2 // use xmm0 as the temp register. punpckldq xmm2, xmm6 movlpd qword ptr [edx], xmm2 movhpd qword ptr [ebx], xmm2 @@ -209,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. + movdqa xmm0, xmm1 // use xmm0 as the temp register. punpckldq xmm1, xmm5 movlpd qword ptr [edx], xmm1 movhpd qword ptr [ebx], xmm1 @@ -218,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. + movdqa xmm0, xmm3 // use xmm0 as the temp register. punpckldq xmm3, xmm7 movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc index 494164fd02..e91560c44c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_any.cc @@ -19,30 +19,38 @@ namespace libyuv { extern "C" { #endif +// memset for temp is meant to clear the source buffer (not dest) so that +// SIMD that reads full multiple of 16 bytes will not trigger msan errors. +// memset is not needed for production, as the garbage values are processed but +// not used, although there may be edge cases for subsampling. +// The size of the buffer is based on the largest read, which can be inferred +// by the source type (e.g. ARGB) and the mask (last parameter), or by examining +// the source code for how much the source pointers are advanced. + // Subsampled source needs to be increase by 1 of not even. #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - const uint8* a_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) @@ -53,36 +61,57 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif #undef ANY41C // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif @@ -94,35 +123,38 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. // Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif -#ifdef HAS_I411TOARGBROW_SSSE3 -ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) @@ -130,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) #endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) @@ -144,47 +176,87 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I411TOARGBROW_AVX2 -ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) -#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TORGB565ROW_AVX2 -ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif #undef ANY31C +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + // Any 2 planes to 1. -#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } // Merge functions. #ifdef HAS_MERGEUVROW_SSE2 @@ -196,6 +268,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif // Math functions. #ifdef HAS_ARGBMULTIPLYROW_SSE2 @@ -225,44 +300,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBSUBTRACTROW_NEON ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + } // Biplanar to RGB. #ifdef HAS_NV12TOARGBROW_SSSE3 @@ -274,6 +366,9 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -283,6 +378,27 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV21TOARGBROW_NEON ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif @@ -292,22 +408,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #ifdef HAS_NV12TORGB565ROW_NEON ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif #undef ANY21C // Any 1 to 1. -#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) @@ -325,6 +444,15 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) +ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) +ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) +#endif #if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) #endif @@ -332,6 +460,18 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif #if defined(HAS_J400TOARGBROW_SSE2) ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #endif @@ -372,9 +512,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif @@ -403,30 +555,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif @@ -434,23 +613,44 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif #ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -466,29 +666,38 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_NEON ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) @@ -506,61 +715,184 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - T shuffler, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, - const uint32, 4, 2, 3) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32_t, + 4, + 2, + 3) #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32_t, + 4, + 2, + 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32_t, + 4, + 2, + 7) #endif -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32_t, + 4, + 2, + 7) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif #ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif #undef ANY11P +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. +#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#undef ANY11P16 + // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -573,25 +905,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, \ - int source_y_fraction) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_INTERPOLATEROW_AVX2 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) @@ -602,25 +937,25 @@ ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_DSPR2 -ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #undef ANY11T // Any 1 to 1 mirror. -#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) @@ -631,6 +966,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -640,67 +978,54 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif #undef ANY11M // Any 1 plane. (memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8 temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } #ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) #endif #ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #endif #undef ANY1 // Any 1 to 2. Outputs UV planes. -#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ - SIMD_ALIGNED(uint8 temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - /* repeat last 4 bytes for 422 subsampler */ \ - if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - /* repeat last 4 - 12 bytes for 411 subsampler */ \ - if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \ - } \ - if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \ - } \ - if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } #ifdef HAS_SPLITUVROW_SSE2 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) @@ -711,8 +1036,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #ifdef HAS_SPLITUVROW_NEON ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_SPLITUVROW_DSPR2 -ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -727,37 +1052,66 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif #undef ANY12 +// Any 1 to 3. Outputs RGB planes. +#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif + // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. -#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ - uint8* dst_u, uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) @@ -783,30 +1137,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_MSA +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif @@ -816,6 +1197,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif #undef ANY12S #ifdef __cplusplus diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc index aefa38c495..2bbc5adbf1 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include #include // For memcpy and memset. #include "libyuv/basic_types.h" @@ -23,59 +24,69 @@ extern "C" { #define USE_BRANCHLESS 1 #if USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +static __inline int32_t clamp0(int32_t v) { return ((-(v) >> 31) & (v)); } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (((255 - (v)) >> 31) | (v)) & 255; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { int m = v >> 31; return (v + m) ^ m; } -#else // USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { return (v < 0) ? 0 : v; } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (v > 255) ? 255 : v; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { return (v < 0) ? -v : v; } #endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} -#ifdef LIBYUV_LITTLE_ENDIAN -#define WRITEWORD(p, v) *(uint32*)(p) = v +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} + +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v #else -static inline void WRITEWORD(uint8* p, uint32 v) { - p[0] = (uint8)(v & 255); - p[1] = (uint8)((v >> 8) & 255); - p[2] = (uint8)((v >> 16) & 255); - p[3] = (uint8)((v >> 24) & 255); +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); } #endif -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb24[0]; - uint8 g = src_rgb24[1]; - uint8 r = src_rgb24[2]; + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -85,12 +96,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -100,12 +111,12 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { } } -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_rgb24[0] = b; dst_rgb24[1] = g; dst_rgb24[2] = r; @@ -114,12 +125,14 @@ void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { } } -void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 2) | (g >> 4); dst_argb[2] = (r << 3) | (r >> 2); @@ -129,14 +142,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; - uint8 a = src_argb1555[1] >> 7; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 3) | (g >> 2); dst_argb[2] = (r << 3) | (r >> 2); @@ -146,14 +160,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, } } -void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; - uint8 a = src_argb4444[1] >> 4; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; dst_argb[0] = (b << 4) | b; dst_argb[1] = (g << 4) | g; dst_argb[2] = (r << 4) | r; @@ -163,12 +178,53 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, } } -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = b; dst_rgb[1] = g; dst_rgb[2] = r; @@ -177,12 +233,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = r; dst_rgb[1] = g; dst_rgb[2] = b; @@ -191,25 +247,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 2; - uint8 r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } @@ -221,132 +277,160 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { // endian will not affect order of the original matrix. But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 3; - uint8 r1 = src_argb[6] >> 3; - uint8 a1 = src_argb[7] >> 7; - *(uint32*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - *(uint16*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - uint8 b1 = src_argb[4] >> 4; - uint8 g1 = src_argb[5] >> 4; - uint8 r1 = src_argb[6] >> 4; - uint8 a1 = src_argb[7] >> 4; - *(uint32*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - *(uint16*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); } } -static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } } -static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; + } +} + +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } -static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } -#define MAKEROWY(NAME, R, G, B, BPP) \ -void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ - src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ - src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ - src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ -} +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -381,64 +465,65 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 -static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; } -static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #define AVGB(a, b) (((a) + (b) + 1) >> 1) -#define MAKEROWYJ(NAME, R, G, B, BPP) \ -void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ -} +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; b = (b << 3) | (b >> 2); g = (g << 2) | (g >> 4); r = (r << 3) | (r >> 2); @@ -448,12 +533,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { } } -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; b = (b << 3) | (b >> 2); g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -463,12 +548,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { } } -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; b = (b << 4) | b; g = (g << 4) | g; r = (r << 4) | r; @@ -478,26 +563,29 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } } -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b1 = src_rgb565[2] & 0x1f; - uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8 r1 = src_rgb565[3] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b3 = next_rgb565[2] & 0x1f; - uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8 r3 = next_rgb565[3] >> 3; - uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 787 -> 888. r = (r << 1) | (r >> 6); dst_u[0] = RGBToU(r, g, b); @@ -508,15 +596,15 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, dst_v += 1; } if (width & 1) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b = (b0 + b2); // 565 * 2 = 676. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 676 -> 888 g = (g << 1) | (g >> 6); r = (r << 2) | (r >> 4); @@ -525,26 +613,29 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, } } -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b1 = src_argb1555[2] & 0x1f; - uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8 b3 = next_argb1555[2] & 0x1f; - uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 777 -> 888. g = (g << 1) | (g >> 6); r = (r << 1) | (r >> 6); @@ -556,15 +647,15 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = next_argb1555[1] >> 3; - uint8 b = (b0 + b2); // 555 * 2 = 666. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -573,26 +664,29 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, } } -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b1 = src_argb4444[2] & 0x0f; - uint8 g1 = src_argb4444[2] >> 4; - uint8 r1 = src_argb4444[3] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b3 = next_argb4444[2] & 0x0f; - uint8 g3 = next_argb4444[2] >> 4; - uint8 r3 = next_argb4444[3] & 0x0f; - uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -604,15 +698,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b = (b0 + b2); // 444 * 2 = 555. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 3) | (b >> 2); // 555 -> 888. g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -621,13 +715,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, } } -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); src_argb += 4; @@ -636,45 +732,10 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 16; - dst_u += 1; - dst_v += 1; - } - // Odd width handling mimics 'any' function which replicates last pixel. - if ((width & 3) == 3) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } -} - -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; @@ -683,7 +744,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } // Convert a row of image to Sepia tone. -void ARGBSepiaRow_C(uint8* dst_argb, int width) { +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -702,22 +763,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = src_argb[0]; int g = src_argb[1]; int r = src_argb[2]; int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + - r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); @@ -728,7 +795,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, } // Apply color table to a row of image. -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -744,7 +813,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } // Apply color table to a row of image. -void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -757,8 +828,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -772,21 +846,23 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, } #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 24 +#define SHADE(f, v) v* f >> 24 -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - const uint32 b_scale = REPEAT8(value & 0xff); - const uint32 g_scale = REPEAT8((value >> 8) & 0xff); - const uint32 r_scale = REPEAT8((value >> 16) & 0xff); - const uint32 a_scale = REPEAT8(value >> 24); +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb[0]); - const uint32 g = REPEAT8(src_argb[1]); - const uint32 r = REPEAT8(src_argb[2]); - const uint32 a = REPEAT8(src_argb[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -799,20 +875,22 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #undef SHADE #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 16 +#define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb0[0]); - const uint32 g = REPEAT8(src_argb0[1]); - const uint32 r = REPEAT8(src_argb0[2]); - const uint32 a = REPEAT8(src_argb0[3]); - const uint32 b_scale = src_argb1[0]; - const uint32 g_scale = src_argb1[1]; - const uint32 r_scale = src_argb1[2]; - const uint32 a_scale = src_argb1[3]; + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -827,8 +905,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -852,8 +932,10 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -876,8 +958,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, #undef SHADE // Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width) { +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i]; @@ -890,12 +975,14 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8)(clamp255(sobel)); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); } } -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i + 0]; @@ -908,56 +995,62 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8)(clamp255(sobel)); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); } } -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_argb[0] = (uint8)(s); - dst_argb[1] = (uint8)(s); - dst_argb[2] = (uint8)(s); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_y[i] = (uint8)(s); + dst_y[i] = (uint8_t)(s); } } -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int g = clamp255(r + b); - dst_argb[0] = (uint8)(b); - dst_argb[1] = (uint8)(g); - dst_argb[2] = (uint8)(r); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Copy a Y to RGB. int x; for (x = 0; x < width; ++x) { - uint8 y = src_y[0]; + uint8_t y = src_y[0]; dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = 255u; dst_argb += 4; @@ -974,75 +1067,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // B = (Y - 16) * 1.164 - U * -2.018 // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // U and V contributions to R,G,B. #define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ #define VR -102 /* round(-1.596 * 64) */ // Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) // 64 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) // 32 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1062,74 +1149,68 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { // Y contribution to R,G,B. Scale and bias. #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YGB 32 /* 64 / 2 */ // U and V contributions to R,G,B. #define UB -113 /* round(-1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR -90 /* round(-1.40200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ // Bias values to round, and subtract 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1143,81 +1224,76 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { #undef YG // BT.709 YUV to RGB reference -// * R = Y - V * -1.28033 -// * G = Y - U * 0.21482 - V * 0.38059 -// * B = Y - U * -2.12798 +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html // Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -// TODO(fbarchard): Find way to express 2.12 instead of 2.0. +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.12798 * 64)) */ -#define UG 14 /* round(0.21482 * 64) */ -#define VG 24 /* round(0.38059 * 64) */ -#define VR -82 /* round(-1.28033 * 64) */ +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ // Bias values to round, and subtract 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1231,8 +1307,14 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r, +// Reads 8 bit YUV and leaves result as 16 bit. + +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; @@ -1263,22 +1345,129 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, int yg = yuvconstants->kYToRgb[0]; #endif - uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32) (-(v * vr) + y1 + br) >> 6); + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(y1 + YGB) >> 6); - *g = Clamp((int32)(y1 + YGB) >> 6); - *r = Clamp((int32)(y1 + YGB) >> 6); +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); } #undef YG @@ -1288,16 +1477,16 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 u = (src_u[0] + src_u[1] + 1) >> 1; - uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; @@ -1310,22 +1499,22 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } #else -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1336,19 +1525,19 @@ void I444ToARGBRow_C(const uint8* src_y, #endif // Also used for 420 -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1356,26 +1545,120 @@ void I422ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422AlphaToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -1384,47 +1667,47 @@ void I422AlphaToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1435,8 +1718,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1447,23 +1730,22 @@ void I422ToARGB4444Row_C(const uint8* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - 0xf000; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1474,8 +1756,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1486,23 +1768,22 @@ void I422ToARGB1555Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - 0x8000; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1513,8 +1794,8 @@ void I422ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_u += 1; src_v += 1; @@ -1525,111 +1806,111 @@ void I422ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - YuvPixel(src_y[2], src_u[0], src_v[0], - rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants); - rgb_buf[11] = 255; - YuvPixel(src_y[3], src_u[0], src_v[0], - rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants); - rgb_buf[15] = 255; - src_y += 4; - src_u += 1; - src_v += 1; - rgb_buf += 16; // Advance 4 pixels. - } - if (width & 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - src_y += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - } -} - -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_vu, - uint8* rgb_buf, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_vu += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); @@ -1640,8 +1921,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. @@ -1651,67 +1932,67 @@ void NV12ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* rgb_buf, +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* rgb_buf, +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -1719,13 +2000,13 @@ void I422ToRGBARow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } -void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); @@ -1741,7 +2022,7 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { } } -void MirrorRow_C(const uint8* src, uint8* dst, int width) { +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; src += width - 1; for (x = 0; x < width - 1; x += 2) { @@ -1754,7 +2035,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -1770,10 +2054,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; - const uint32* src32 = (const uint32*)(src); - uint32* dst32 = (uint32*)(dst); + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); src32 += width - 1; for (x = 0; x < width - 1; x += 2) { dst32[x] = src32[0]; @@ -1785,7 +2069,10 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; @@ -1800,7 +2087,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1816,20 +2105,110 @@ void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, } } -void CopyRow_C(const uint8* src, uint8* dst, int count) { +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } -void CopyRow_16_C(const uint16* src, uint16* dst, int count) { +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint8 v8, int width) { +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { memset(dst, v8, width); } -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { - uint32* d = (uint32*)(dst_argb); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { d[x] = v32; @@ -1837,8 +2216,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { } // Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values, filtering 2 rows of YUY2. int x; for (x = 0; x < width; x += 2) { @@ -1851,8 +2233,10 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, } // Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1865,7 +2249,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2, } // Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1879,8 +2263,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } // Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1893,8 +2280,10 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, } // Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1907,7 +2296,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy, } // Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1925,17 +2314,19 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1958,13 +2349,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } if (width & 1) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1973,9 +2364,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef BLEND -#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst[0] = UBLEND(src0[0], src1[0], alpha[0]); @@ -1995,13 +2389,13 @@ void BlendPlaneRow_C(const uint8* src0, const uint8* src1, // Multiply source RGB by alpha and store to destination. // This code mimics the SSSE3 version for better testability. -void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - uint32 a = src_argb[3]; + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2019,10 +2413,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } if (width & 1) { - const uint32 b = src_argb[0]; - const uint32 g = src_argb[1]; - const uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2038,49 +2432,56 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. #define T(a) 0x01000000 + (0x10000 / a) -const uint32 fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), - T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), - T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), - T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), - T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), - T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), - T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), - T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), - T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), - T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), - T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), - T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), - T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), - T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), - T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), - T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), - T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), - T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), - T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), - T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), - T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), - T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), - T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), - T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), - T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), - T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), - T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), - T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; - const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point b = (b * ia) >> 8; g = (g * ia) >> 8; r = (r * ia) >> 8; @@ -2094,31 +2495,37 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - int32 row_sum[4] = {0, 0, 0, 0}; +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { row_sum[0] += row[x * 4 + 0]; row_sum[1] += row[x * 4 + 1]; row_sum[2] += row[x * 4 + 2]; row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; } } -void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, - int w, int area, uint8* dst, int count) { +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { - dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); dst += 4; tl += 4; bl += 4; @@ -2127,8 +2534,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, // Copy pixels from rotated source to destination row with a slope. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { int i; // Render a row of pixels from source into a buffer. float uv[2]; @@ -2137,9 +2547,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, for (i = 0; i < width; ++i) { int x = (int)(uv[0]); int y = (int)(uv[1]); - *(uint32*)(dst_argb) = - *(const uint32*)(src_argb + y * src_argb_stride + - x * 4); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; @@ -2147,16 +2556,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // Blend 2 rows into 1. -static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride, - uint8* dst_uv, int width) { +static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, - uint16* dst_uv, int width) { +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -2164,12 +2577,14 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, } // C version 2x2 -> 2x1. -void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr1 = src_ptr + src_stride; int x; if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); @@ -2194,12 +2609,14 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint16* src_ptr1 = src_ptr + src_stride; + const uint16_t* src_ptr1 = src_ptr + src_stride; int x; if (source_y_fraction == 0) { memcpy(dst_ptr, src_ptr, width * 2); @@ -2222,8 +2639,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; @@ -2232,10 +2651,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, int x; for (x = 0; x < width; ++x) { // To support in-place conversion. - uint8 b = src_argb[index0]; - uint8 g = src_argb[index1]; - uint8 r = src_argb[index2]; - uint8 a = src_argb[index3]; + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -2245,10 +2664,11 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, } } -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_y[0]; @@ -2268,10 +2688,11 @@ void I422ToYUY2Row_C(const uint8* src_y, } } -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_u[0]; @@ -2291,9 +2712,8 @@ void I422ToUYVYRow_C(const uint8* src_y, } } - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { int i; @@ -2323,33 +2743,75 @@ void ARGBPolynomialRow_C(const uint8* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32)(db)); - dst_argb[1] = Clamp((int32)(dg)); - dst_argb[2] = Clamp((int32)(dr)); - dst_argb[3] = Clamp((int32)(da)); + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); src_argb += 4; dst_argb += 4; } } -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { - uint32 bc = lumacoeff & 0xff; - uint32 gc = (lumacoeff >> 8) & 0xff; - uint32 rc = (lumacoeff >> 16) & 0xff; +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjust the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; - const uint8* luma1; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; - luma1 = ((src_argb[4] * bc + src_argb[5] * gc + - src_argb[6] * rc) & 0x7F00u) + luma; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; @@ -2359,8 +2821,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } if (width & 1) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; @@ -2368,7 +2831,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } } -void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[3]; @@ -2381,7 +2844,7 @@ void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { } } -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst_a[0] = src_argb[3]; @@ -2394,7 +2857,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { } } -void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[0]; @@ -2413,13 +2876,13 @@ void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { #if !(defined(_MSC_VER) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2434,14 +2897,14 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2456,14 +2919,14 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2478,13 +2941,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); @@ -2497,14 +2960,102 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, } #endif -#if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2523,14 +3074,14 @@ void I422ToRGB565Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2549,14 +3100,14 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2575,19 +3126,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); - // TODO(fbarchard): ARGBToRGB24Row_AVX2 +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2598,13 +3152,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); @@ -2621,6 +3175,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, } #endif +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc index 1ac7ef1aa3..8d3cb81cec 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_gcc.cc @@ -1,4 +1,3 @@ -// VERSION 2 /* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * @@ -23,1663 +22,2001 @@ extern "C" { #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -static vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; -static vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; -static vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // Constants for BGRA -static vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR -static vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. -static uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. -static uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" - "lea " MEMLEA(0x18,0) ",%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) - MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif + +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" -void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } -void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_RGB24TOARGBROW_SSSE3 +/* + +ARGBToAR30Row: + +Red Blue +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + +static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32_t kMaskRB10 = 0x3ff003ff; +static const uint32_t kMaskAG10 = 0xc000ff00; +static const uint32_t kMulAG10 = 64 * 65536 + 1028; + +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multipler for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multipler for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 +void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUVJ128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6" - ); + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" - -// Read 2 UV from 411, upsample to 8 UV. -// reading 4 bytes is an msan violation. -// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" -// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) -// pinsrw fails with drmemory -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_TEMP \ - "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ - "movd %[temp],%%xmm0 \n" \ - MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ - "movd %[temp],%%xmm1 \n" \ - "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. -#define READYUY2 \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. -#define READUYVY \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ - "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ - "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ - "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS #endif +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + // Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ - "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ - "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" // Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ - "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" -void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +// Store 8 AR30 values. +#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB @@ -1691,15 +2028,15 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1707,8 +2044,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) "punpcklbw %%xmm1,%%xmm0 \n" @@ -1719,16 +2057,16 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" - "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -1736,23 +2074,24 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB @@ -1764,24 +2103,125 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + LABELALIGN - "1: \n" + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB @@ -1792,64 +2232,31 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_SSSE3 -#ifdef HAS_I411TOARGBROW_SSSE3 -void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - int temp; + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) - "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - READYUV411_TEMP - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [temp]"=&r"(temp), // %[temp] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif -void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP(yuvconstants) - "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB @@ -1860,21 +2267,24 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB @@ -1886,20 +2296,23 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB @@ -1911,20 +2324,23 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB @@ -1936,23 +2352,25 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA @@ -1964,7 +2382,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1972,179 +2390,211 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ - "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 \ - "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 \ - "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ - "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ - "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ - "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ - "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ - "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ - "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + #else // Convert 16 pixels: 16 UV and 16 Y. + #define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ - "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ - "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 #endif +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + // Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ - "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ - "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. +#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2157,65 +2607,34 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP_AVX2(yuvconstants) - "sub %[u_buf],%[v_buf] \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - READYUV411_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I411TOARGBROW_AVX2 - #if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2223,27 +2642,144 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422ALPHATOARGBROW_AVX2) +#if defined(HAS_I422TOAR30ROW_AVX2) // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + LABELALIGN - "1: \n" + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2255,33 +2791,35 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_AVX2 #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2292,11 +2830,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2304,7 +2842,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -2313,16 +2851,18 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, #if defined(HAS_NV12TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2334,25 +2874,28 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV12TOARGBROW_AVX2 #if defined(HAS_NV21TOARGBROW_AVX2) // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2365,24 +2908,27 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV21TOARGBROW_AVX2 #if defined(HAS_YUY2TOARGBROW_AVX2) // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2395,24 +2941,27 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_YUY2TOARGBROW_AVX2 #if defined(HAS_UYVYTOARGBROW_AVX2) // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2425,1131 +2974,1603 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %4,%%xmm1 \n" - "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. -static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" - LABELALIGN - "1: \n" - VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" - "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" - "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off asm volatile ( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "jmp 9f \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" "sub $0x20,%2 \n" - "jg 2b \n" - "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + "m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS // Multiple of 1. -void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ", %%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" - "lea " MEMLEA(0x20, 0) ", %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8, 1) ", %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" - "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint8 v8, int width) { +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); - const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } -void SetRow_ERMS(uint8* dst, uint8 v8, int width) { +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb " MEMSTORESTRING(al,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); -} -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" - // 1 pixel loop. - "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBBLENDROW_SSSE3 @@ -3559,46 +4580,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } #endif // HAS_BLENDPLANEROW_SSSE3 @@ -3608,312 +4632,308 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" - // 32 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -static uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u -}; -static uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - // replace VPGATHER - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "movzb " MEMACCESS2(0x13,0) ",%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x17,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x1b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x1f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER + // 8 pixel loop. + LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -3922,412 +4942,415 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -static vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { - asm volatile ( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" #if defined(__AVX2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 @@ -4336,52 +5359,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 - MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 @@ -4390,50 +5414,50 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" - MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" - MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 @@ -4443,79 +5467,79 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -4525,1004 +5549,1123 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6," MEMACCESS(2) " \n" - "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movd " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" + // 1 pixel loop. + LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { - asm volatile ( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" - // 4 pixel small loop \n" - LABELALIGN - "4: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" + // 4 pixel small loop. + LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 \n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - "lea " MEMLEA(0x10,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - "lea " MEMLEA(0x10,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* src_dudv, int width) { +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( - "movq " MEMACCESS(3) ",%%xmm2 \n" - "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd %1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1," MEMACCESS(2) " \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0," MEMACCESS2(0x08,2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x04,2) ",%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - MEMOPMEM(movdqu,xmm2,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" - // Blend 50 / 50. - LABELALIGN - "50: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb " MEMMOVESTRING(1,0) " \n" - "jmp 999f \n" + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" - ); + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - uintptr_t pixel_temp; - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - "mov " MEMACCESS(4) ",%k2 \n" - "cmp $0x3000102,%k2 \n" - "je 3012f \n" - "cmp $0x10203,%k2 \n" - "je 123f \n" - "cmp $0x30201,%k2 \n" - "je 321f \n" - "cmp $0x2010003,%k2 \n" - "je 2103f \n" - - LABELALIGN - "1: \n" - "movzb " MEMACCESS(4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS(1) " \n" - "movzb " MEMACCESS2(0x1,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x1,1) " \n" - "movzb " MEMACCESS2(0x2,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x2,1) " \n" - "movzb " MEMACCESS2(0x3,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x3,1) " \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "sub $0x1,%3 \n" - "jg 1b \n" - "jmp 99f \n" - - LABELALIGN - "123: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm1,%%xmm1 \n" - "pshuflw $0x1b,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 123b \n" - "jmp 99f \n" - - LABELALIGN - "321: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x39,%%xmm0,%%xmm0 \n" - "pshuflw $0x39,%%xmm0,%%xmm0 \n" - "pshufhw $0x39,%%xmm1,%%xmm1 \n" - "pshuflw $0x39,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 321b \n" - "jmp 99f \n" - - LABELALIGN - "2103: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x93,%%xmm0,%%xmm0 \n" - "pshuflw $0x93,%%xmm0,%%xmm0 \n" - "pshufhw $0x93,%%xmm1,%%xmm1 \n" - "pshuflw $0x93,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 2103b \n" - "jmp 99f \n" - - LABELALIGN - "3012: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0xc6,%%xmm0,%%xmm0 \n" - "pshuflw $0xc6,%%xmm0,%%xmm0 \n" - "pshufhw $0xc6,%%xmm1,%%xmm1 \n" - "pshuflw $0xc6,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 3012b \n" - - "99: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "=&d"(pixel_temp), // %2 - "+r"(width) // %3 - : "r"(shuffler) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); -} -#endif // HAS_ARGBSHUFFLEROW_SSE2 - #ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(3) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 #ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS(3) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 -#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" - "addps " MEMACCESS(3) ",%%xmm0 \n" - "addps " MEMACCESS(3) ",%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" - "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" - "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" - "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" - // 2 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels - "lea " MEMLEA(0x8,0) ",%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + // 2 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "movzb " MEMACCESS2(-0x1,0) ",%1 \n" - MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x1,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { + const uint8_t* luma, + uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(2) ",%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS(2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS(3) " \n" - "movzb " MEMACCESS2(0x1,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x1,3) " \n" - "movzb " MEMACCESS2(0x2,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x2,3) " \n" - "movzb " MEMACCESS2(0x3,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x3,3) " \n" + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS2(0x4,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x4,3) " \n" - "movzb " MEMACCESS2(0x5,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x5,3) " \n" - "movzb " MEMACCESS2(0x6,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x6,3) " \n" - "movzb " MEMACCESS2(0x7,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x7,3) " \n" + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movzb " MEMACCESS2(0x8,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x8,3) " \n" - "movzb " MEMACCESS2(0x9,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x9,3) " \n" - "movzb " MEMACCESS2(0xa,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xa,3) " \n" - "movzb " MEMACCESS2(0xb,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xb,3) " \n" + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" - "movzb " MEMACCESS2(0xc,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xc,3) " \n" - "movzb " MEMACCESS2(0xd,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xd,3) " \n" - "movzb " MEMACCESS2(0xe,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xe,3) " \n" - "movzb " MEMACCESS2(0xf,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xf,3) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "lea " MEMLEA(0x10,3) ",%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" - ); + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc deleted file mode 100644 index 285f0b5adc..0000000000 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_mips.cc +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -#ifdef HAS_COPYROW_MIPS -void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { - __asm__ __volatile__ ( - ".set noreorder \n" - ".set noat \n" - "slti $at, %[count], 8 \n" - "bne $at ,$zero, $last8 \n" - "xor $t8, %[src], %[dst] \n" - "andi $t8, $t8, 0x3 \n" - - "bne $t8, $zero, unaligned \n" - "negu $a3, %[dst] \n" - // make dst/src aligned - "andi $a3, $a3, 0x3 \n" - "beq $a3, $zero, $chk16w \n" - // word-aligned now count is the remining bytes count - "subu %[count], %[count], $a3 \n" - - "lwr $t8, 0(%[src]) \n" - "addu %[src], %[src], $a3 \n" - "swr $t8, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - - // Now the dst/src are mutually word-aligned with word-aligned addresses - "$chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? - // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, chk8w \n" - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" - // t0 is the "past the end" address - - // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past - // the "t0-32" address - // This means: for x=128 the last "safe" a1 address is "t0-160" - // Alternatively, for x=64 the last "safe" a1 address is "t0-96" - // we will use "pref 30,128(a1)", so "t0-160" is the limit - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line of src - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $loop16w \n" - "nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$loop16w: \n" - "pref 0, 96(%[src]) \n" - "lw $t0, 0(%[src]) \n" - "bgtz $v1, $skip_pref30_96 \n" // skip - "lw $t1, 4(%[src]) \n" - "pref 30, 96(%[dst]) \n" // continue - "$skip_pref30_96: \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lw $t0, 32(%[src]) \n" - "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) - "lw $t1, 36(%[src]) \n" - "pref 30, 128(%[dst]) \n" // set dest, addr 128 - "$skip_pref30_128: \n" - "lw $t2, 40(%[src]) \n" - "lw $t3, 44(%[src]) \n" - "lw $t4, 48(%[src]) \n" - "lw $t5, 52(%[src]) \n" - "lw $t6, 56(%[src]) \n" - "lw $t7, 60(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst], %[dst], 64 \n" // adding 64 to dest - "sgtu $v1, %[dst], $t9 \n" - "bne %[dst], $a3, $loop16w \n" - " addiu %[src], %[src], 64 \n" // adding 64 to src - "move %[count], $t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? - // the t8 is the reminder count past 32-bytes - "beq %[count], $t8, chk1w \n" - // count=t8,no 32-byte chunk - " nop \n" - - "lw $t0, 0(%[src]) \n" - "lw $t1, 4(%[src]) \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, $last8 \n" - " subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - // copying in words (4-byte chunks) - "$wordCopy_loop: \n" - "lw $t3, 0(%[src]) \n" - // the first t3 may be equal t0 ... optimize? - "addiu %[src], %[src],4 \n" - "addiu %[dst], %[dst],4 \n" - "bne %[dst], $a3,$wordCopy_loop \n" - " sw $t3, -4(%[dst]) \n" - - // For the last (<8) bytes - "$last8: \n" - "blez %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 -last dst address - "$last8loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst], $a3, $last8loop \n" - " sb $v1, -1(%[dst]) \n" - - "leave: \n" - " j $ra \n" - " nop \n" - - // - // UNALIGNED case - // - - "unaligned: \n" - // got here with a3="negu a1" - "andi $a3, $a3, 0x3 \n" // a1 is word aligned? - "beqz $a3, $ua_chk16w \n" - " subu %[count], %[count], $a3 \n" - // bytes left after initial a3 bytes - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 - "swr $v1, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - // below the dst will be word aligned (NOTE1) - "$ua_chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? - // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, ua_chk8w \n" - // if a2==t8, no 64-byte chunks - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" // t0 "past the end" - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line addr 32 - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // safe, as we have at least 64 bytes ahead - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $ua_loop16w \n" - // skip "pref 30,64(a1)" for too short arrays - " nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$ua_loop16w: \n" - "pref 0, 96(%[src]) \n" - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "bgtz $v1, $ua_skip_pref30_96 \n" - " lwl $t1, 7(%[src]) \n" - "pref 30, 96(%[dst]) \n" - // continue setting up the dest, addr 96 - "$ua_skip_pref30_96: \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lwr $t0, 32(%[src]) \n" - "lwl $t0, 35(%[src]) \n" - "lwr $t1, 36(%[src]) \n" - "bgtz $v1, ua_skip_pref30_128 \n" - " lwl $t1, 39(%[src]) \n" - "pref 30, 128(%[dst]) \n" - // continue setting up the dest, addr 128 - "ua_skip_pref30_128: \n" - - "lwr $t2, 40(%[src]) \n" - "lwl $t2, 43(%[src]) \n" - "lwr $t3, 44(%[src]) \n" - "lwl $t3, 47(%[src]) \n" - "lwr $t4, 48(%[src]) \n" - "lwl $t4, 51(%[src]) \n" - "lwr $t5, 52(%[src]) \n" - "lwl $t5, 55(%[src]) \n" - "lwr $t6, 56(%[src]) \n" - "lwl $t6, 59(%[src]) \n" - "lwr $t7, 60(%[src]) \n" - "lwl $t7, 63(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst],%[dst],64 \n" // adding 64 to dest - "sgtu $v1,%[dst],$t9 \n" - "bne %[dst],$a3,$ua_loop16w \n" - " addiu %[src],%[src],64 \n" // adding 64 to src - "move %[count],$t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "ua_chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? - // the t8 is the reminder count - "beq %[count], $t8, $ua_chk1w \n" - // when count==t8, no 32-byte chunk - - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "lwl $t1, 7(%[src]) \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "$ua_chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, ua_smallCopy \n" - "subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - - // copying in words (4-byte chunks) - "$ua_wordCopy_loop: \n" - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addiu %[src], %[src], 4 \n" - "addiu %[dst], %[dst], 4 \n" - // note: dst=a1 is word aligned here, see NOTE1 - "bne %[dst], $a3, $ua_wordCopy_loop \n" - " sw $v1,-4(%[dst]) \n" - - // Now less than 4 bytes (value in count) left to copy - "ua_smallCopy: \n" - "beqz %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 = last dst address - "$ua_smallCopy_loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst],$a3,$ua_smallCopy_loop \n" - " sb $v1, -1(%[dst]) \n" - - "j $ra \n" - " nop \n" - ".set at \n" - ".set reorder \n" - : [dst] "+r" (dst), [src] "+r" (src) - : [count] "r" (count) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", - "t8", "t9", "a3", "v1", "at" - ); -} -#endif // HAS_COPYROW_MIPS - -// DSPR2 functions -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) - -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - "1: \n" - "addiu $t4, $t4, -1 \n" - "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "addiu %[src_uv], %[src_uv], 32 \n" - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "sw $t9, 0(%[dst_v]) \n" - "sw $t0, 0(%[dst_u]) \n" - "sw $t1, 4(%[dst_v]) \n" - "sw $t2, 4(%[dst_u]) \n" - "sw $t3, 8(%[dst_v]) \n" - "sw $t5, 8(%[dst_u]) \n" - "sw $t6, 12(%[dst_v]) \n" - "sw $t7, 12(%[dst_u]) \n" - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t4, %[width], 4 \n" // multiplies of 16 - "andi $t5, %[width], 0xf \n" - "blez $t4, 2f \n" - " addu %[src], %[src], %[width] \n" // src += width - - "1: \n" - "lw $t0, -16(%[src]) \n" // |3|2|1|0| - "lw $t1, -12(%[src]) \n" // |7|6|5|4| - "lw $t2, -8(%[src]) \n" // |11|10|9|8| - "lw $t3, -4(%[src]) \n" // |15|14|13|12| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t1, $t1 \n" // |6|7|4|5| - "wsbh $t2, $t2 \n" // |10|11|8|9| - "wsbh $t3, $t3 \n" // |14|15|12|13| - "rotr $t0, $t0, 16 \n" // |0|1|2|3| - "rotr $t1, $t1, 16 \n" // |4|5|6|7| - "rotr $t2, $t2, 16 \n" // |8|9|10|11| - "rotr $t3, $t3, 16 \n" // |12|13|14|15| - "addiu %[src], %[src], -16 \n" - "addiu $t4, $t4, -1 \n" - "sw $t3, 0(%[dst]) \n" // |15|14|13|12| - "sw $t2, 4(%[dst]) \n" // |11|10|9|8| - "sw $t1, 8(%[dst]) \n" // |7|6|5|4| - "sw $t0, 12(%[dst]) \n" // |3|2|1|0| - "bgtz $t4, 1b \n" - " addiu %[dst], %[dst], 16 \n" - "beqz $t5, 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, -1(%[src]) \n" - "addiu $t5, $t5, -1 \n" - "addiu %[src], %[src], -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgez $t5, 2b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src] "+r" (src), [dst] "+r" (dst) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", "t5" - ); -} - -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - int x; - int y; - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "addu $t4, %[width], %[width] \n" - "srl %[x], %[width], 4 \n" - "andi %[y], %[width], 0xf \n" - "blez %[x], 2f \n" - " addu %[src_uv], %[src_uv], $t4 \n" - - "1: \n" - "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| - "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| - "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| - "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| - "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| - "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| - "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| - "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| - - "rotr $t0, $t0, 16 \n" // |1|0|3|2| - "rotr $t1, $t1, 16 \n" // |5|4|7|6| - "rotr $t2, $t2, 16 \n" // |9|8|11|10| - "rotr $t3, $t3, 16 \n" // |13|12|15|14| - "rotr $t4, $t4, 16 \n" // |17|16|19|18| - "rotr $t6, $t6, 16 \n" // |21|20|23|22| - "rotr $t7, $t7, 16 \n" // |25|24|27|26| - "rotr $t8, $t8, 16 \n" // |29|28|31|30| - "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| - "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| - "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| - "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| - "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| - "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| - "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| - "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| - "addiu %[src_uv], %[src_uv], -32 \n" - "addiu %[x], %[x], -1 \n" - "swr $t4, 0(%[dst_u]) \n" - "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| - "swr $t6, 0(%[dst_v]) \n" - "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| - "swr $t3, 4(%[dst_v]) \n" - "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| - "swr $t0, 8(%[dst_u]) \n" - "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| - "swr $t1, 8(%[dst_v]) \n" - "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| - "swr $t9, 12(%[dst_u]) \n" - "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| - "swr $t5, 12(%[dst_v]) \n" - "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz %[x], 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - "beqz %[y], 3f \n" - " nop \n" - "b 2f \n" - " nop \n" - - "2: \n" - "lbu $t0, -2(%[src_uv]) \n" - "lbu $t1, -1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], -2 \n" - "addiu %[y], %[y], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[y], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v), - [x] "=&r" (x), - [y] "=&r" (y) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t7", "t8", "t9" - ); -} - -// Convert (4 Y and 2 VU) I422 and arrange RGB values into -// t5 = | 0 | B0 | 0 | b0 | -// t4 = | 0 | B1 | 0 | b1 | -// t9 = | 0 | G0 | 0 | g0 | -// t8 = | 0 | G1 | 0 | g1 | -// t2 = | 0 | R0 | 0 | r0 | -// t1 = | 0 | R1 | 0 | r1 | -#define YUVTORGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ - "addu.ph $t1, $t1, $s5 \n" - -// TODO(fbarchard): accept yuv conversion constants. -void I422ToARGBRow_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| // clipping - "lui $s6, 0xff00 \n" - "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - - "1: \n" - YUVTORGB -// Arranging into argb format - "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| - "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| - "addiu %[width], -4 \n" - "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| - "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| - "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| - - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| - "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| - "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| - "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| - "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| - "sll $t9, $t9, 16 \n" - "sll $t8, $t8, 16 \n" - "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| - "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - int y0_fraction = 256 - source_y_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "replv.ph $t0, %[y0_fraction] \n" - "replv.ph $t1, %[source_y_fraction] \n" - - "1: \n" - "lw $t2, 0(%[src_ptr]) \n" - "lw $t3, 0(%[src_ptr1]) \n" - "lw $t4, 4(%[src_ptr]) \n" - "lw $t5, 4(%[src_ptr1]) \n" - "muleu_s.ph.qbl $t6, $t2, $t0 \n" - "muleu_s.ph.qbr $t7, $t2, $t0 \n" - "muleu_s.ph.qbl $t8, $t3, $t1 \n" - "muleu_s.ph.qbr $t9, $t3, $t1 \n" - "muleu_s.ph.qbl $t2, $t4, $t0 \n" - "muleu_s.ph.qbr $t3, $t4, $t0 \n" - "muleu_s.ph.qbl $t4, $t5, $t1 \n" - "muleu_s.ph.qbr $t5, $t5, $t1 \n" - "addq.ph $t6, $t6, $t8 \n" - "addq.ph $t7, $t7, $t9 \n" - "addq.ph $t2, $t2, $t4 \n" - "addq.ph $t3, $t3, $t5 \n" - "shra.ph $t6, $t6, 8 \n" - "shra.ph $t7, $t7, 8 \n" - "shra.ph $t2, $t2, 8 \n" - "shra.ph $t3, $t3, 8 \n" - "precr.qb.ph $t6, $t6, $t7 \n" - "precr.qb.ph $t2, $t2, $t3 \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[src_ptr1], %[src_ptr1], 8 \n" - "addiu %[dst_width], %[dst_width], -8 \n" - "sw $t6, 0(%[dst_ptr]) \n" - "sw $t2, 4(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[dst_ptr], %[dst_ptr], 8 \n" - - ".set pop \n" - : [dst_ptr] "+r" (dst_ptr), - [src_ptr1] "+r" (src_ptr1), - [src_ptr] "+r" (src_ptr), - [dst_width] "+r" (dst_width) - : [source_y_fraction] "r" (source_y_fraction), - [y0_fraction] "r" (y0_fraction), - [src_stride] "r" (src_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} -#endif // __mips_dsp_rev >= 2 - -#endif // defined(__mips__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc new file mode 100644 index 0000000000..4fb2631f0b --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_msa.cc @@ -0,0 +1,3512 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. +#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. +#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. +#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb0 += 128; + src_argb0_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = __builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((v16i8*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = __msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); + src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); + src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); + src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc index 909df060c6..ff87e74c62 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon.cc @@ -10,6 +10,8 @@ #include "libyuv/row.h" +#include + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -20,1446 +22,1311 @@ extern "C" { !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.32 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.32 {d2[1]}, [%2]! \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.16 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.16 {d2[1]}, [%2]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d2, d3 \n" +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "vld2.8 {d0, d2}, [%0]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READYUY2 \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" -#define YUVTORGB_SETUP \ - MEMACCESS([kUVToRB]) \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - MEMACCESS([kUVToG]) \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - MEMACCESS([kYToRgb]) \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ - "vmovl.u8 q0, d0 \n" /* Y */\ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */\ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV444 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %5, %5, #8 \n" - MEMACCESS(3) - "vld1.8 {d23}, [%3]! \n" - MEMACCESS(4) - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV411 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB - MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -#define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB4444 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV400 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d23, #255 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV21 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUY2 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READUYVY - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U - MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); } -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0" - ); +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); } -// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0" - ); +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); } -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); } -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" - ); + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); } -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); } -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); } -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ \ +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); } -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); } -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "vdup.32 d2, %2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" - ARGBTORGB565 - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" - ); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); } -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" - ); -} - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. - MEMACCESS(0) - "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. - "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. - - "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. - "vpadd.u16 d1, d8, d9 \n" // B - "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. - "vpadd.u16 d3, d10, d11 \n" // G - "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. - "vpadd.u16 d5, d12, d13 \n" // R - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %3, %3, #32 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); } +// clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1468,17 +1335,13 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1490,9 +1353,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1507,8 +1368,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient @@ -1517,17 +1381,13 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1539,9 +1399,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1555,8 +1413,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1565,17 +1426,13 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1587,9 +1444,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1603,8 +1458,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1613,17 +1471,13 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1635,9 +1489,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1651,8 +1503,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1661,17 +1516,13 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1683,9 +1534,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1699,8 +1548,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1709,17 +1561,13 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1731,9 +1579,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1747,8 +1593,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1757,17 +1606,13 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1779,9 +1624,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1796,875 +1639,815 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" - "99: \n" + "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" - ); + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" - ); + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" - ); + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - "1: \n" - MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2672,54 +2455,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2727,115 +2506,186 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top - MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom - MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left - MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right - MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); } -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus } // extern "C" diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc index 6375d4f55f..24b4520bab 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_neon64.cc @@ -19,118 +19,103 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - MEMACCESS(2) \ - "ld1 {v1.s}[1], [%2], #4 \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.h}[0], [%1], #2 \n" \ - MEMACCESS(2) \ - "ld1 {v2.h}[1], [%2], #2 \n" \ - "zip1 v1.8b, v2.8b, v2.8b \n" +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - MEMACCESS(2) \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" +#define READYUV400 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READUYVY \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ - "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -140,7 +125,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -157,10 +141,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, ); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -170,7 +154,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -187,11 +170,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, ); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -199,10 +182,8 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - MEMACCESS(3) "ld1 {v23.8b}, [%3], #8 \n" "subs %w5, %w5, #8 \n" - MEMACCESS(4) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -220,40 +201,10 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV411 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -263,7 +214,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -280,10 +230,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, ); } -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -292,7 +242,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -309,97 +258,91 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ); } -#define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } -#define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } -#define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -411,7 +354,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 - MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -428,9 +370,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -438,7 +378,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -453,31 +392,26 @@ void I400ToARGBRow_NEON(const uint8* src_y, ); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "movi v23.8b, #255 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -487,7 +421,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y, READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -503,9 +436,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -515,7 +448,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -531,24 +463,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 + "+r"(dst_rgb24), // %2 "+r"(width) // %3 : [kUVToRB]"r"(&yuvconstants->kUVToRB), [kUVToG]"r"(&yuvconstants->kUVToG), @@ -559,8 +489,59 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP + "1: \n" + READNV21 + YUVTORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -570,7 +551,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -585,8 +565,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -596,7 +576,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, READUYVY YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -612,869 +591,819 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store U - MEMACCESS(2) - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load U - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 - "subs %w2, %w2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0" - ); -} - -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "rev64 v0.16b, v0.16b \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. - "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1" - ); + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); } -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); } -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); } -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ - -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); } -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ \ +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); } -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - MEMACCESS(1) - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); } -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); } -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); } -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" - ARGBTORGB565 - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" - ); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); } -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); } -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" - ); + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); } -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #15 \n" // B * 0.11400 coefficient - "movi v5.8b, #75 \n" // G * 0.58700 coefficient - "movi v6.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v24", "v25", "v26", "v27", "v28", "v29" - ); + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. - "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. - "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. - "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. - - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" - - "subs %w3, %w3, #32 \n" // 32 processed per loop. - "mul v3.8h, v0.8h, v20.8h \n" // B - "mls v3.8h, v1.8h, v21.8h \n" // G - "mls v3.8h, v2.8h, v22.8h \n" // R - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "mul v4.8h, v2.8h, v20.8h \n" // R - "mls v4.8h, v1.8h, v24.8h \n" // G - "mls v4.8h, v0.8h, v23.8h \n" // B - "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); -} +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1486,9 +1415,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1503,9 +1430,12 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 @@ -1514,12 +1444,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1531,9 +1459,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1547,18 +1473,19 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_bgra_1 = src_bgra + src_stride_bgra; +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1570,9 +1497,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_bgra), // %0 @@ -1586,18 +1511,19 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_abgr_1 = src_abgr + src_stride_abgr; +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1609,9 +1535,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_abgr), // %0 @@ -1625,18 +1549,19 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgba_1 = src_rgba + src_stride_rgba; +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1648,9 +1573,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgba), // %0 @@ -1664,18 +1587,19 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1687,9 +1611,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb24), // %0 @@ -1703,18 +1625,19 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_raw_1 = src_raw + src_stride_raw; +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1726,9 +1649,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1743,699 +1664,656 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile ( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27" - ); + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - ); + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" - ); + ); } -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", - "v24", "v25", "v26", "v27" - ); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); } -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" - ); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5" - ); + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.ge 8b \n" +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" - "99: \n" + "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18" - ); + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - MEMACCESS(1) - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7" - ); + // 8 pixel loop. + "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movi v24.8b, #15 \n" // B * 0.11400 coefficient - "movi v25.8b, #75 \n" // G * 0.58700 coefficient - "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. @@ -2443,194 +2321,180 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - "1: \n" - MEMACCESS(0) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v22", "v23", "v24", "v25" - ); + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); } // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2638,54 +2502,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2693,114 +2553,329 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%5 \n" // top - MEMACCESS(0) - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - MEMACCESS(2) - "ld1 {v2.8b}, [%2],%5 \n" // bottom - MEMACCESS(2) - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(3) - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%4 \n" // left - MEMACCESS(1) - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%5 \n" // right - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } + +// Caveat - rounds float to half float whereas scaling version truncates. +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" // scale + "fmax v5.4s, v5.4s, v1.4s \n" // max + "fmax v6.4s, v6.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "fmax v5.4s, v5.4s, v6.4s \n" // max + "fmaxv %s3, v5.4s \n" // signed max acculator + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fmax; +} + +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fsum; + asm volatile( + "movi v5.4s, #0 \n" // max + "movi v6.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "fmul v4.4s, v2.4s, %4.s[0] \n" + "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares + "fmla v6.4s, v2.4s, v2.4s \n" + "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + "faddp v5.4s, v5.4s, v6.4s \n" + "faddp v5.4s, v5.4s, v5.4s \n" + "faddp %3.4s, v5.4s, v5.4s \n" // sum + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fsum) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); + return fsum; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%4], #16 \n" + "uaddl v0.4s, v1.4h, v2.4h \n" // * 1 + "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1 + "ld1 {v2.8h}, [%1], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "ld1 {v2.8h}, [%2], #16 \n" + "umlal v0.4s, v2.4h, v7.4h \n" // * 6 + "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6 + "ld1 {v2.8h}, [%3], #16 \n" + "umlal v0.4s, v2.4h, v6.4h \n" // * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 + "subs %w6, %w6, #8 \n" // 8 processed per loop + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "v0", "v1", "v2", "v6", "v7"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc index 2a3da8969f..5500d7f5a6 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/row_win.cc @@ -28,72 +28,71 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. -#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ - dst_argb += 32; - +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV422 YUVTORGB(yuvconstants) @@ -104,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, #endif #if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4, xmm5; - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 YUVTORGB(yuvconstants) @@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. -static const vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static const vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; -static const vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static const vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -static const vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; // Constants for BGRA. -static const vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static const vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static const vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR. -static const vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static const vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static const vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static const vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static const vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static const vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static const uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static const uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static const uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: @@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: @@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB @@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -__declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int width) { +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB @@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, } } -__declspec(naked) -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 mov ecx, [esp + 12] // width movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 @@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green psllw xmm4, 10 psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green vpsllw ymm4, ymm4, 10 vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles vpsrlw ymm3, ymm2, 4 vpsllw ymm1, ymm0, 4 vpor ymm2, ymm2, ymm3 vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm2, ymm2, 0xd8 vpunpckhbw ymm1, ymm0, ymm2 vpunpcklbw ymm0, ymm0, ymm2 @@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB + por xmm1, xmm2 // RB movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) pand xmm2, xmm7 - por xmm0, xmm2 // AG + por xmm0, xmm2 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. -__declspec(naked) -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f movd xmm4, eax pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles movdqa xmm1, xmm0 movdqa xmm3, xmm2 psllw xmm1, 4 @@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes + punpcklbw xmm6, xmm6 // make dither 16 bytes movdqa xmm7, xmm6 punpcklwd xmm6, xmm6 punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes vpermq ymm6, ymm6, 0xd8 vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. -__declspec(naked) -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 + movdqa xmm5, xmm4 // generate mask 0x000003e0 pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 + movdqa xmm6, xmm4 // generate mask 0x00007c00 pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 @@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble psrld xmm0, 4 psrld xmm1, 8 por xmm0, xmm1 @@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 vpslld ymm7, ymm7, 15 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA vpackssdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble vpsrld ymm1, ymm1, 8 vpsrld ymm0, ymm0, 4 vpor ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToY movdqa xmm5, xmmword ptr kAddY16 @@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToY vbroadcastf128 ymm5, xmmword ptr kAddY16 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ vbroadcastf128 ymm5, xmmword ptr kAddYJ64 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kBGRAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kABGRToY movdqa xmm5, xmmword ptr kAddY16 @@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kRGBAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1423,9 +1410,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUVJ128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1493,9 +1482,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1510,9 +1499,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm1, 8 packsswb xmm0, xmm1 - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1558,9 +1549,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1574,9 +1565,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1624,9 +1617,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1641,9 +1634,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUV444Row_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U + /* convert to U and V */ + movdqu xmm0, [eax] // U movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, paddb xmm0, xmm5 movdqu [edx], xmm0 - movdqu xmm0, [eax] // V + movdqu xmm0, [eax] // V movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1754,9 +1750,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1783,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1824,9 +1822,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1894,9 +1894,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOYROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16] \ - } - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 __asm { \ - __asm vmovd xmm0, dword ptr [esi] /* U */ \ - __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. -#define READNV21_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) __asm { \ +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ @@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ __asm vpsubw ymm1, ymm3, ymm1 \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 \ - /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ } // Store 16 ARGB values. -#define STOREARGB_AVX2 __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} // Store 16 RGBA values. -#define STORERGBA_AVX2 __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV444_AVX2 YUVTORGB_AVX2(ebx) @@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV411_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I411TOARGBROW_AVX2 - #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV12_AVX2 @@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV21_AVX2 @@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_YUY2TOARGBROW_AVX2 // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUY2_AVX2 @@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, #ifdef HAS_UYVYTOARGBROW_AVX2 // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READUYVY_AVX2 @@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy, #ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -__declspec(naked) -void I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // Allows a conversion with half size scaling. // Read 8 UV from 444. -#define READYUV444 __asm { \ +#define READYUV444 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8] \ - } - -// Read 2 UV from 411, upsample to 8 UV. -// drmemory fails with memory fault if pinsrw used. libyuv bug: 525 -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_EBX __asm { \ - __asm movzx ebx, word ptr [esi] /* U */ \ - __asm movd xmm0, ebx \ - __asm movzx ebx, word ptr [esi + edi] /* V */ \ - __asm movd xmm1, ebx \ - __asm lea esi, [esi + 2] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 __asm { \ +#define READNV12 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. -#define READNV21 __asm { \ +#define READNV21 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) __asm { \ +#define YUVTORGB(YuvConstants) \ + __asm { \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm movdqa xmm3, xmm0 \ @@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. -#define STOREARGB __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 BGRA values. -#define STOREBGRA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGBA values. -#define STORERGBA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGB24 values. -#define STORERGB24 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24] \ - } + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} // Store 8 RGB565 values. -#define STORERGB565 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16] \ - } + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 @@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). -__declspec(naked) -void I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) -void I422ToRGB565Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 psrld xmm6, 26 pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 convertloop: @@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV422 @@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. -__declspec(naked) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2819,63 +2743,23 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) -void I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov edx, [esp + 16 + 16] // abgr - mov ebp, [esp + 16 + 20] // yuvconstants - mov ecx, [esp + 16 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV411_EBX - YUVTORGB(ebp) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV12 @@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV21 @@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUY2 @@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, // 8 pixels. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READUYVY @@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, } } -__declspec(naked) -void I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) -void I400ToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) movd xmm3, eax pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y + punpcklbw xmm0, xmm0 // Y.Y pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 psrlw xmm0, 6 - packuswb xmm0, xmm0 // G + packuswb xmm0, xmm0 // G - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 movdqu [edx], xmm0 @@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) -void I400ToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) vmovd xmm3, eax vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 vpslld ymm4, ymm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y vpmulhuw ymm0, ymm0, ymm2 vpsubusw ymm0, ymm0, ymm3 vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels vpor ymm0, ymm0, ymm4 vpor ymm1, ymm1, ymm4 vmovdqu [edx], ymm0 @@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; // TODO(fbarchard): Replace lea with -16 offset. -__declspec(naked) -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, xmmword ptr kShuffleMirror @@ -3140,11 +3022,12 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vbroadcastf128 ymm5, xmmword ptr kShuffleMirror @@ -3164,17 +3047,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm1, xmmword ptr kShuffleMirrorUV @@ -3198,11 +3081,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width lea eax, [eax - 16 + ecx * 4] // last 4 pixels. @@ -3221,15 +3105,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. -static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -__declspec(naked) -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 @@ -3246,16 +3129,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3265,10 +3149,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes + pand xmm0, xmm5 // even bytes pand xmm1, xmm5 packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes + psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 movdqu [edx], xmm0 @@ -3285,16 +3169,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3302,9 +3187,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm2, ymm0, 8 // odd bytes vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm0, ymm0, ymm5 // even bytes vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm2, ymm2, ymm3 @@ -3324,24 +3209,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - movdqu xmm0, [eax] // read 16 U's + movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs movdqu [edi], xmm0 movdqu [edi + 16], xmm2 lea edi, [edi + 32] @@ -3355,24 +3241,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 @@ -3388,13 +3275,14 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 16 byte load/store, 32 bytes at time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width test eax, 15 jne convertloopu test edx, 15 @@ -3426,13 +3314,14 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +// CopyRow copys 'width' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] @@ -3451,14 +3340,15 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1. -__declspec(naked) -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, esi mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // width rep movsb mov edi, edx mov esi, eax @@ -3468,15 +3358,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3504,14 +3395,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vmovdqu ymm1, [eax] @@ -3533,11 +3425,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a mov ecx, [esp + 12] // width extractloop: @@ -3558,17 +3451,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3598,14 +3528,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vpmovzxbd ymm1, qword ptr [eax] @@ -3628,17 +3559,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// Write 'count' bytes using an 8 bit value repeated. -// Count should be multiple of 4. -__declspec(naked) -void SetRow_X86(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { __asm { - movzx eax, byte ptr [esp + 8] // v8 + movzx eax, byte ptr [esp + 8] // v8 mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. + mul edx // overwrites edx with upper part of result. mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width shr ecx, 2 rep stosd mov edi, edx @@ -3646,28 +3576,28 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } } -// Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) -void SetRow_ERMS(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width rep stosb mov edi, edx ret } } -// Write 'count' 32 bit values. -__declspec(naked) -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { +// Write 'width' 32 bit values. +__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width rep stosd mov edi, edx ret @@ -3676,12 +3606,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 @@ -3689,9 +3620,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm0, ymm0, ymm5 // even bytes are Y vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3702,18 +3633,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { } } -__declspec(naked) -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3723,18 +3656,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3746,16 +3679,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3763,18 +3697,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3785,21 +3719,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm0, ymm0, 8 // odd bytes are Y vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3810,18 +3744,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3831,18 +3767,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3854,16 +3790,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3871,18 +3808,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3895,21 +3832,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) -void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y + pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -3920,18 +3857,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3943,13 +3882,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3963,16 +3902,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3980,13 +3920,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3999,19 +3939,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y + psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4022,18 +3962,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4045,13 +3987,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4065,16 +4007,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4082,13 +4025,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4108,13 +4051,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. movd xmm6, eax @@ -4123,8 +4068,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. movd xmm7, eax pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4132,17 +4077,17 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 8 pixel loop. + // 8 pixel loop. convertloop8: - movq xmm0, qword ptr [esi] // alpha + movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a + pxor xmm0, xmm5 // a, 255-a movq xmm1, qword ptr [eax + esi] // src0 movq xmm2, qword ptr [edx + esi] // src1 punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 + psubb xmm1, xmm6 // bias src0/1 - 128 pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. + paddw xmm0, xmm7 // unbias result - 32768 and round. psrlw xmm0, 8 packuswb xmm0, xmm0 movq qword ptr [edi + esi], xmm0 @@ -4163,13 +4108,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 vpsllw ymm5, ymm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. vmovd xmm6, eax @@ -4177,8 +4124,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. vmovd xmm7, eax vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4186,23 +4133,23 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 32 pixel loop. + // 32 pixel loop. convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a vmovdqu ymm1, [eax + esi] // src0 vmovdqu ymm2, [edx + esi] // src1 vpunpckhbw ymm4, ymm1, ymm2 vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 vpmaddubsw ymm3, ymm3, ymm4 vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. vpsrlw ymm3, ymm3, 8 vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm3 @@ -4221,52 +4168,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 sub ecx, 4 - jl convertloop4b // less than 4 pixels? + jl convertloop4b // less than 4 pixels? - // 4 pixel loop. + // 4 pixel loop. convertloop4: - movdqu xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4276,26 +4222,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. convertloop1: - movd xmm3, [eax] // src argb + movd xmm3, [eax] // src argb lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4311,41 +4257,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, xmmword ptr kShuffleAlpha0 movdqa xmm5, xmmword ptr kShuffleAlpha1 convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha + por xmm0, xmm2 // copy original alpha movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4358,22 +4305,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; -__declspec(naked) -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpshufb ymm2, ymm0, ymm4 // low 4 alphas @@ -4398,40 +4346,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width lea ebx, fixed_invtbl8 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 + punpcklbw xmm0, xmm0 // first 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a + pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 + punpckhbw xmm1, xmm1 // next 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a + pmulhuw xmm1, xmm2 // rgb * a lea eax, [eax + 16] packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4450,25 +4398,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a @@ -4488,50 +4435,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#else // USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width sub edx, eax lea ebx, fixed_invtbl8 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] // end of VPGATHER - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a @@ -4540,7 +4487,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpackuswb ymm0, ymm0, ymm1 // unmutated. vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] sub ecx, 8 @@ -4558,12 +4505,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -4575,20 +4523,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes + packuswb xmm0, xmm0 // 8 G bytes movdqu xmm2, [eax] // A movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -4604,24 +4552,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static const vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static const vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ movdqa xmm2, xmmword ptr kARGBToSepiaB movdqa xmm3, xmmword ptr kARGBToSepiaG movdqa xmm4, xmmword ptr kARGBToSepiaR @@ -4633,32 +4577,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 movdqu [eax], xmm0 movdqu [eax + 16], xmm1 lea eax, [eax + 32] @@ -4674,19 +4618,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. -__declspec(naked) -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ movdqu xmm5, [ecx] pshufd xmm2, xmm5, 0x00 pshufd xmm3, xmm5, 0x55 pshufd xmm4, xmm5, 0xaa pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ convertloop: movdqu xmm0, [eax] // B @@ -4697,31 +4642,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm6 lea eax, [eax + 32] @@ -4735,15 +4680,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ pshuflw xmm2, xmm2, 040h pshufd xmm2, xmm2, 044h pshuflw xmm3, xmm3, 040h @@ -4756,16 +4703,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 // * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4780,25 +4727,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { +__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4814,28 +4762,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4853,13 +4802,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4867,11 +4817,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4882,11 +4832,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb0 lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 + movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4901,22 +4851,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4930,28 +4881,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4967,20 +4919,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4996,20 +4949,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5028,14 +4982,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5045,17 +5001,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5063,7 +5019,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5084,13 +5040,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5098,17 +5055,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5116,7 +5073,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5137,36 +5094,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA por xmm0, xmm5 movdqu [edx], xmm1 movdqu [edx + 16], xmm2 @@ -5184,22 +5142,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely + paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -5217,36 +5176,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA punpcklbw xmm3, xmm5 punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS + movdqa xmm4, xmm1 // YS punpcklbw xmm4, xmm2 punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 movdqu [edx], xmm6 movdqu [edx + 16], xmm4 movdqu [edx + 32], xmm7 @@ -5275,8 +5235,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { __asm { mov eax, topleft // eax topleft @@ -5294,18 +5257,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, cmp area, 128 // 128 pixels will not overflow 15 bits. ja l4 - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 psrld xmm6, 16 cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts - // 4 pixel loop small blocks. + // 4 pixel loop small blocks. s4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5345,9 +5308,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, jmp l4b - // 4 pixel loop + // 4 pixel loop l4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5373,7 +5336,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, paddd xmm3, [esi + edx * 4 + 48] lea esi, [esi + 64] - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area cvtdq2ps xmm1, xmm1 mulps xmm0, xmm4 mulps xmm1, xmm4 @@ -5397,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5422,8 +5385,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { __asm { mov eax, row mov edx, cumsum @@ -5437,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -5483,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5505,10 +5470,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { __asm { push esi push edi @@ -5519,46 +5485,46 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride + shl esi, 16 // 4, stride add esi, 4 movd xmm5, esi sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 + movdqa xmm0, xmm2 // x0, y0, x1, y1 addps xmm0, xmm7 movlhps xmm2, xmm0 movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 + addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 movq qword ptr [edx], xmm1 movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 movd xmm6, [eax + esi] // read pixel 2 movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] sub ecx, 4 @@ -5568,12 +5534,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel movd [edx], xmm0 @@ -5590,15 +5556,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5607,7 +5574,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5634,14 +5601,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5651,7 +5618,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5666,25 +5633,26 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. movd xmm0, eax // high fraction 0..255 neg eax @@ -5703,7 +5671,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5720,7 +5688,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -5731,7 +5699,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 @@ -5747,15 +5715,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -__declspec(naked) -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width + mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] @@ -5773,15 +5742,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] @@ -5801,152 +5771,36 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - __asm { - push ebx - push esi - mov eax, [esp + 8 + 4] // src_argb - mov edx, [esp + 8 + 8] // dst_argb - mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // width - pxor xmm5, xmm5 - - mov ebx, [esi] // shuffler - cmp ebx, 0x03000102 - je shuf_3012 - cmp ebx, 0x00010203 - je shuf_0123 - cmp ebx, 0x00030201 - je shuf_0321 - cmp ebx, 0x02010003 - je shuf_2103 - - // TODO(fbarchard): Use one source pointer and 3 offsets. - shuf_any1: - movzx ebx, byte ptr [esi] - movzx ebx, byte ptr [eax + ebx] - mov [edx], bl - movzx ebx, byte ptr [esi + 1] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 1], bl - movzx ebx, byte ptr [esi + 2] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 2], bl - movzx ebx, byte ptr [esi + 3] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 3], bl - lea eax, [eax + 4] - lea edx, [edx + 4] - sub ecx, 1 - jg shuf_any1 - jmp shuf99 - - shuf_0123: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB - pshuflw xmm0, xmm0, 01Bh - pshufhw xmm1, xmm1, 01Bh - pshuflw xmm1, xmm1, 01Bh - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0123 - jmp shuf99 - - shuf_0321: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB - pshuflw xmm0, xmm0, 039h - pshufhw xmm1, xmm1, 039h - pshuflw xmm1, xmm1, 039h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0321 - jmp shuf99 - - shuf_2103: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA - pshuflw xmm0, xmm0, 093h - pshufhw xmm1, xmm1, 093h - pshuflw xmm1, xmm1, 093h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_2103 - jmp shuf99 - - shuf_3012: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB - pshuflw xmm0, xmm0, 0C6h - pshufhw xmm1, xmm1, 0C6h - pshuflw xmm1, xmm1, 0C6h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_3012 - - shuf99: - pop esi - pop ebx - ret - } -} - // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV + punpcklbw xmm0, xmm2 // YUYV punpckhbw xmm1, xmm2 movdqu [edi], xmm0 movdqu [edi + 16], xmm1 @@ -5960,30 +5814,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -5998,22 +5852,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. convertloop: -// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel -// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6057,25 +5911,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 vbroadcastf128 ymm6, [ecx + 32] // C2 vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ // 2 pixel loop. convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats + vcvtdq2ps ymm0, ymm0 // X 8 floats vmulps ymm2, ymm0, ymm0 // X * X vmulps ymm3, ymm0, ymm7 // C3 * X vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X @@ -6095,16 +5949,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. + convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. + vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -__declspec(naked) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6131,13 +6094,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6162,27 +6126,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -__declspec(naked) -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width, - const uint8* luma, uint32 lumacoeff) { +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff pshufd xmm2, xmm2, 0 pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr + movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc index 36e3fe5281..2cfa1c6cb1 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale.cc @@ -33,17 +33,25 @@ static __inline int Abs(int v) { // This is an optimized version for scaling down a plane to 1/2 of // its original size. -static void ScalePlaneDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -51,46 +59,63 @@ static void ScalePlaneDown2(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : - ScaleRowDown2Box_Any_NEON); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : - ScaleRowDown2Box_NEON); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : - ScaleRowDown2Box_Any_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : - ScaleRowDown2Box_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : - ScaleRowDown2Box_Any_AVX2); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : - ScaleRowDown2Box_AVX2); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); } } #endif -#if defined(HAS_SCALEROWDOWN2_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } } #endif @@ -105,18 +130,25 @@ static void ScalePlaneDown2(int src_width, int src_height, } } -static void ScalePlaneDown2_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_16_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : - ScaleRowDown2Box_16_C); + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -124,23 +156,17 @@ static void ScalePlaneDown2_16(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : - ScaleRowDown2_16_NEON; + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); } #endif @@ -159,24 +185,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height, // This is an optimized version for scaling down a plane to 1/4 of // its original size. -static void ScalePlaneDown4(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } @@ -184,8 +216,8 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } @@ -193,19 +225,20 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; } } #endif -#if defined(HAS_SCALEROWDOWN4_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } } #endif @@ -219,38 +252,36 @@ static void ScalePlaneDown4(int src_width, int src_height, } } -static void ScalePlaneDown4_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : - ScaleRowDown4_16_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN4_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : - ScaleRowDown4_16_SSE2; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif @@ -265,18 +296,23 @@ static void ScalePlaneDown4_16(int src_width, int src_height, } // Scale plane down, 3/4 - -static void ScalePlaneDown34(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; @@ -305,6 +341,26 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -325,19 +381,6 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -346,8 +389,7 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -363,17 +405,23 @@ static void ScalePlaneDown34(int src_width, int src_height, } } -static void ScalePlaneDown34_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_C; @@ -404,19 +452,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -425,8 +460,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -442,7 +476,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } - // Scale plane, 3/8 // This is an optimized version for scaling down a plane to 3/8 // of its original size. @@ -458,18 +491,24 @@ static void ScalePlaneDown34_16(int src_width, int src_height, // ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2 -static void ScalePlaneDown38(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; @@ -517,16 +556,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } } } #endif @@ -554,17 +600,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } -static void ScalePlaneDown38_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_C; @@ -595,19 +647,6 @@ static void ScalePlaneDown38_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -634,8 +673,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, #define MIN1(x) ((x) < 1 ? 1 : (x)) -static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -644,8 +683,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { return sum; } -static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -654,8 +693,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { return sum; } -static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -666,13 +709,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -684,22 +732,32 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, x += dx; boxwidth = MIN1((x >> 16) - ix); *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int scaleval = 65536 / boxheight; int i; + (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { *dst_ptr++ = src_ptr[i] * scaleval >> 16; } } -static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -710,8 +768,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, } } -static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -728,10 +790,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. -static void ScalePlaneBox(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -739,18 +805,18 @@ static void ScalePlaneBox(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint16. + // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C: - ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = - ScaleAddRow_C; + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleAddRow = ScaleAddRow_Any_SSE2; @@ -775,11 +841,19 @@ static void ScalePlaneBox(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -787,20 +861,24 @@ static void ScalePlaneBox(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row16, 0, src_width * 2); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16 *)(row16), src_width); + ScaleAddRow(src, (uint16_t*)(row16), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row16); } } -static void ScalePlaneBox_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -808,17 +886,17 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint32. + // Allocate a row buffer of uint32_t. align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = - ScaleAddRow_16_C; + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { @@ -829,7 +907,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -837,10 +915,10 @@ static void ScalePlaneBox_16(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32 *)(row32), src_width); + ScaleAddRow(src, (uint32_t*)(row32), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); @@ -848,10 +926,14 @@ static void ScalePlaneBox_16(int src_width, int src_height, } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -864,14 +946,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -898,16 +980,15 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif - #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_SSSE3; @@ -920,6 +1001,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (y > max_y) { y = max_y; @@ -927,7 +1016,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -944,10 +1033,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, free_aligned_buffer_64(row); } -void ScalePlaneBilinearDown_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -960,14 +1053,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1002,15 +1095,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif - #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1023,13 +1107,13 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; - InterpolateRow((uint16*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; @@ -1041,10 +1125,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } // Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1053,14 +1141,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = filtering ? ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -1087,14 +1175,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1111,6 +1191,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; @@ -1126,13 +1214,13 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -1172,10 +1260,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } -void ScalePlaneBilinearUp_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1184,14 +1276,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1226,14 +1318,6 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; @@ -1257,13 +1341,13 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 4); - uint16* rowptr = (uint16*)row; + uint16_t* rowptr = (uint16_t*)row; int rowstride = kRowSize; int lasty = yi; @@ -1308,20 +1392,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScalePlaneSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = ScaleCols_C; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1340,20 +1428,24 @@ static void ScalePlaneSimple(int src_width, int src_height, } } -static void ScalePlaneSimple_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = ScaleCols_16_C; + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1366,8 +1458,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } @@ -1377,14 +1468,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height, // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1403,46 +1498,42 @@ void ScalePlane(const uint8* src, int src_stride, if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1455,19 +1546,23 @@ void ScalePlane(const uint8* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, - enum FilterMode filtering) { +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1483,19 +1578,16 @@ void ScalePlane_16(const uint16* src, int src_stride, CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); return; } - if (dst_width == src_width) { + if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled vertically. - ScalePlaneVertical_16(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1508,15 +1600,14 @@ void ScalePlane_16(const uint16* src, int src_stride, return; } // 3/8 rounded up for odd sized chroma height. - if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1524,8 +1615,8 @@ void ScalePlane_16(const uint16* src, int src_stride, } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1538,132 +1629,110 @@ void ScalePlane_16(const uint16* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } // Deprecated api LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_width, src_height, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - dst_width, dst_height, - interpolate ? kFilterBox : kFilterNone); -} - -// Deprecated api -LIBYUV_API -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate) { - // Chroma requires offset to multiple of 2. - int dst_yoffset_even = dst_yoffset & ~1; - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - int aheight = dst_height - dst_yoffset_even * 2; // actual output height - const uint8* src_y = src; - const uint8* src_u = src + src_width * src_height; - const uint8* src_v = src + src_width * src_height + - src_halfwidth * src_halfheight; - uint8* dst_y = dst + dst_yoffset_even * dst_width; - uint8* dst_u = dst + dst_width * dst_height + - (dst_yoffset_even >> 1) * dst_halfwidth; - uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset_even >> 1) * dst_halfwidth; - if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || - dst_yoffset_even >= dst_height) { - return -1; - } - return I420Scale(src_y, src_width, - src_u, src_halfwidth, - src_v, src_halfwidth, - src_width, src_height, - dst_y, dst_width, - dst_u, dst_halfwidth, - dst_v, dst_halfwidth, - dst_width, aheight, - interpolate ? kFilterBox : kFilterNone); + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc index ed76a9e4c0..53ad136404 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_any.cc @@ -20,184 +20,429 @@ extern "C" { // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - int dst_width, int x, int dx) { \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, \ - dst_width & MASK, x + n * dx, dx); \ - } + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif #ifdef HAS_SCALEARGBCOLS_NEON CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, 4, 3) +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) #endif #undef CANY // Fixed scale down. +// Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } #ifdef HAS_SCALEROWDOWN2_SSSE3 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C, - 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, 2, 1, 31) -SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, - 2, 1, 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, - 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, - 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN4_NEON SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) #endif #ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) #endif #ifdef HAS_SCALEROWDOWN38_SSSE3 -SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) #endif #ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) #endif #ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #undef SDANY // Scale down by even scale factor. -#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ - src_stepx, dst_ptr + n * BPP, r); \ - } +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif // Add rows box filter scale down. -#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } #ifdef HAS_SCALEADDROW_SSE2 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) @@ -208,14 +453,12 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #ifdef HAS_SCALEADDROW_NEON SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif #undef SAANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc index 17f51ae9bf..53a22e8b41 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_argb.cc @@ -30,20 +30,31 @@ static __inline int Abs(int v) { // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. -static void ScaleARGBDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = - filtering == kFilterNone ? ScaleARGBRowDown2_C : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : - ScaleARGBRowDown2Box_C); - assert(dx == 65536 * 2); // Test scale factor of 2. + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { @@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height, #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : - ScaleARGBRowDown2Box_Any_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : - ScaleARGBRowDown2Box_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : - ScaleARGBRowDown2Box_Any_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : - ScaleARGBRowDown2Box_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); } } #endif @@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height, // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. -static void ScaleARGBDown4Box(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; // Advance to odd row, even column. src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - assert(dx == 65536 * 4); // Test scale factor of 4. + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, - row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; @@ -137,38 +183,57 @@ static void ScaleARGBDown4Box(int src_width, int src_height, // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. -static void ScaleARGBDownEven(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, - int src_step, uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : - ScaleARGBRowDownEven_Any_SSE2; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : - ScaleARGBRowDownEven_SSE2; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : - ScaleARGBRowDownEven_Any_NEON; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : - ScaleARGBRowDownEven_NEON; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; } } #endif @@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. -static void ScaleARGBBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64 xlast = x + (int64)(dst_width - 1) * dx; - int64 xl = (dx >= 0) ? x : xlast; - int64 xr = (dx >= 0) ? xlast : x; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. if (xr > src_width) { xr = src_width; @@ -234,12 +306,11 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(clip_src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -255,6 +326,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } #endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. @@ -267,7 +346,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -286,18 +365,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } // Scale ARGB up with bilinear interpolation. -static void ScaleARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -324,15 +410,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -347,6 +435,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -359,6 +455,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -375,13 +479,13 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -423,24 +527,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. -static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, int src_stride_y, int src_stride_u, int src_stride_v, int dst_stride_argb, - const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int x, int dx, int y, int dy, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; @@ -465,19 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -502,19 +608,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -529,6 +637,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -541,6 +657,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -558,9 +682,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8* src_row_y = src_y + yi * src_stride_y; - const uint8* src_row_u = src_u + uv_yi * src_stride_u; - const uint8* src_row_v = src_v + uv_yi * src_stride_v; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -569,7 +693,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -635,15 +759,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScaleARGBSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; - void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; + (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; @@ -656,6 +788,14 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; @@ -667,8 +807,8 @@ static void ScaleARGBSimple(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, + dx); dst_argb += dst_stride; y += dy; } @@ -677,11 +817,18 @@ static void ScaleARGBSimple(int src_width, int src_height, // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleARGB(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +static void ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -690,8 +837,7 @@ static void ScaleARGB(const uint8* src, int src_stride, int dy = 0; // ARGB does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. @@ -700,17 +846,17 @@ static void ScaleARGB(const uint8* src, int src_stride, src = src + (src_height - 1) * src_stride; src_stride = -src_stride; } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (clip_x) { - int64 clipf = (int64)(clip_x) * dx; + int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { - int64 clipf = (int64)(clip_y) * dy; + int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * src_stride; dst += clip_y * dst_stride; @@ -725,24 +871,20 @@ static void ScaleARGB(const uint8* src, int src_stride, if (!(dx & 0x10000) && !(dy & 0x10000)) { if (dx == 0x20000) { // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. - ScaleARGBDown4Box(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); return; } - ScaleARGBDownEven(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } // Optimized odd scale down. ie 3, 5, 7, 9x. @@ -759,96 +901,105 @@ static void ScaleARGB(const uint8* src, int src_stride, } if (dx == 0x10000 && (x & 0xffff) == 0) { // Arbitrary scale vertically, but unscaled vertically. - ScalePlaneVertical(src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, y, dy, 4, filtering); + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); return; } if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); } LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - !dst_argb || dst_width <= 0 || dst_height <= 0 || - clip_x < 0 || clip_y < 0 || + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || clip_width > 32768 || clip_height > 32768 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); return 0; } // Scale an ARGB image. LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_argb || dst_width <= 0 || dst_height <= 0) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - 0, 0, dst_width, dst_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); return 0; } // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; - I420ToARGB(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - argb_buffer, src_width * 4, - src_width, src_height); + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. + (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); - r = ARGBScaleClip(argb_buffer, src_width * 4, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, - filtering); + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc index 3507aa4d9f..b28d7da41f 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_common.cc @@ -28,9 +28,12 @@ static __inline int Abs(int v) { } // CPU agnostic row functions -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -86,10 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -103,10 +117,12 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; dst_width -= 1; for (x = 0; x < dst_width - 1; x += 2) { @@ -125,10 +141,12 @@ void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, dst[0] = (s[0] + t[0] + 1) >> 1; } -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -269,19 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -291,19 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -314,19 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -336,19 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, // (1-f)a + fb can be replaced with a + f(b-a) #if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) #else -// inteluses 7 bit math with rounding. -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) #endif -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -450,12 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -468,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -476,12 +540,15 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, } #undef BLENDER -// Same as 8 bit arm blender but return is cast to uint16 -#define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -504,12 +571,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -522,7 +592,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, } #undef BLENDER -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -557,100 +633,118 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } // 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -664,7 +758,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } } -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -678,13 +774,14 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { } } -void ScaleARGBRowDown2_C(const uint8* src_argb, +void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[1]; dst[1] = src[3]; @@ -696,10 +793,12 @@ void ScaleARGBRowDown2_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8_t* dst_argb, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width; ++x) { dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; @@ -710,29 +809,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += 8; dst_argb += 4; } } -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[0]; @@ -745,30 +852,38 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, } } -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += src_stepx * 4; dst_argb += 4; } } // Scales a single row of pixels using point sampling. -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -782,11 +897,14 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, } } -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -801,11 +919,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst[1] = dst[0] = src[0]; src += 1; @@ -818,23 +941,26 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. // Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) (uint32)( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -848,23 +974,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, if (dst_width & 1) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -876,10 +1005,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, dst += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } @@ -889,16 +1018,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, // Scale plane vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(bpp >= 1 && bpp <= 4); @@ -930,13 +1065,11 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -948,23 +1081,29 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_bytes, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); dst_argb += dst_stride; y += dy; } } void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; + void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(wpp >= 1 && wpp <= 2); @@ -1003,16 +1142,6 @@ void ScalePlaneVertical_16(int src_height, InterpolateRow = InterpolateRow_16_NEON; } } -#endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } #endif for (j = 0; j < dst_height; ++j) { int yi; @@ -1022,16 +1151,18 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_words, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); dst_argb += dst_stride; y += dy; } } // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering) { if (src_width < 0) { src_width = -src_width; @@ -1073,22 +1204,26 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div) { - return (int)(((int64)(num) << 16) / div); + return (int)(((int64_t)(num) << 16) / div); } // Divide num by div and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div) { - return (int)((((int64)(num) << 16) - 0x00010001) / - (div - 1)); + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy) { + int* x, + int* y, + int* dx, + int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); @@ -1120,7 +1255,7 @@ void ScaleSlope(int src_width, int src_height, *x = 0; } if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); + *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); @@ -1153,6 +1288,35 @@ void ScaleSlope(int src_width, int src_height, } #undef CENTERSTART +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc index e2f88544b7..312236d2df 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_gcc.cc @@ -21,1296 +21,1348 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stridex3; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } - #ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x8,1) " \n" - "lea " MEMLEA(0xc,1) ",%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } - #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n" - "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2," MEMACCESS(0) " \n" - "lea " MEMLEA(0x2,0) ",%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2," MEMACCESS(0) " \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 #if defined(__x86_64__) - "+rm"(dst_width) // %5 + "+rm"(dst_width) // %5 #else - "+m"(dst_width) // %5 + "+m"(dst_width) // %5 #endif - : "rm"(x), // %6 - "rm"(dx), // %7 + : "rm"(x), // %6 + "rm"(dx), // %7 #if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 #else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 #endif - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - LABELALIGN - "1: \n" - "movd " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 - MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 - MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "movq " MEMACCESS(5) ",%%xmm2 \n" - MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 - MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 - MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" - LABELALIGN - "40: \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 - MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq %%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x8,2) ",%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "psrlw $0x9,%%xmm1 \n" - MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(0) " \n" + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" - LABELALIGN - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc deleted file mode 100644 index ae953073fa..0000000000 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_mips.cc +++ /dev/null @@ -1,644 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC MIPS DSPR2 -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - // TODO(fbarchard): Use odd pixels instead of even. - "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| - "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| - "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| - "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t8, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t1, 8(%[dst]) \n" - "sw $t2, 12(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 16 \n" - - "2: \n" - "andi $t9, %[dst_width], 0xf \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t0, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 2 \n" - "addiu $t9, $t9, -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* t = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 - "bltz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 0(%[t]) \n" // |19|18|17|16| - "lw $t5, 4(%[t]) \n" // |23|22|21|20| - "lw $t6, 8(%[t]) \n" // |27|26|25|24| - "lw $t7, 12(%[t]) \n" // |31|30|29|28| - "addiu $t9, $t9, -1 \n" - "srl $t8, $t0, 16 \n" // |X|X|3|2| - "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| - "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| - "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| - "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| - "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 - "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 - "srl $t8, $t1, 16 \n" // |X|X|7|6| - "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| - "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| - "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| - "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| - "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 - "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 - "srl $t8, $t2, 16 \n" // |X|X|11|10| - "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| - "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| - "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| - "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| - "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 - "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 - "srl $t8, $t3, 16 \n" // |X|X|15|14| - "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| - "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| - "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| - "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| - "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 - "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 - "addiu %[src_ptr], %[src_ptr], 16 \n" - "addiu %[t], %[t], 16 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "sb $t1, 2(%[dst]) \n" - "sb $t5, 3(%[dst]) \n" - "sb $t2, 4(%[dst]) \n" - "sb $t6, 5(%[dst]) \n" - "sb $t3, 6(%[dst]) \n" - "sb $t7, 7(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 0x7 \n" // x = residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lwr $t1, 0(%[src_ptr]) \n" - "lwl $t1, 3(%[src_ptr]) \n" - "lwr $t2, 0(%[t]) \n" - "lwl $t2, 3(%[t]) \n" - "srl $t8, $t1, 16 \n" - "ins $t1, $t2, 16, 16 \n" - "ins $t2, $t8, 0, 16 \n" - "raddu.w.qb $t1, $t1 \n" - "raddu.w.qb $t2, $t2 \n" - "shra_r.w $t1, $t1, 2 \n" - "shra_r.w $t2, $t2, 2 \n" - "sb $t1, 0(%[dst]) \n" - "sb $t2, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -2 \n" - "addiu %[t], %[t], 4 \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 2 \n" - - "3: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), [t] "+r" (t) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| - "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| - "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| - "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| - "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| - "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t5, 4(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 7 \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t1, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -1 \n" - "sb $t1, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - const uint8* s2 = s1 + stride; - const uint8* s3 = s2 + stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 1 \n" - "andi $t8, %[dst_width], 1 \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 4(%[s1]) \n" // |23|22|21|20| - "lw $t6, 4(%[s2]) \n" // |27|26|25|24| - "lw $t7, 4(%[s3]) \n" // |31|30|29|28| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| - "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| - "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| - "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "add $t4, $t4, $t5 \n" - "add $t6, $t6, $t7 \n" - "add $t4, $t4, $t6 \n" - "shra_r.w $t0, $t0, 4 \n" - "shra_r.w $t4, $t4, 4 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[s3], %[s3], 8 \n" - "addiu $t9, $t9, -1 \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 2 \n" - "beqz $t8, 2f \n" - " nop \n" - - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "shra_r.w $t0, $t0, 4 \n" - "sb $t0, 0(%[dst]) \n" - - "2: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [s1] "+r" (s1), - [s2] "+r" (s2), - [s3] "+r" (s3) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| - "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| - "addiu %[dst_width], %[dst_width], -24 \n" - "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| - "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| - "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| - "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| - "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| - "prepend $t1, $t2, 8 \n" // |4|3|1|0| - "prepend $t3, $t4, 24 \n" // |15|13|12|11| - "prepend $t5, $t6, 8 \n" // |20|19|17|16| - "prepend $t7, $t8, 24 \n" // |31|29|28|27| - "sw $t1, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t3, 8(%[dst]) \n" - "sw $t5, 12(%[dst]) \n" - "sw $t9, 16(%[dst]) \n" - "sw $t7, 20(%[dst]) \n" - "bnez %[dst_width], 1b \n" - " addiu %[dst], %[dst], 24 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| - "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t2, $t2, $t4 \n" - "addu.ph $t6, $t6, $t5 \n" - "sll $t5, $t0, 1 \n" - "add $t0, $t5, $t0 \n" - "shra_r.ph $t2, $t2, 2 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shll.ph $t4, $t2, 1 \n" - "addq.ph $t4, $t4, $t2 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.w $t0, $t0, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "shra_r.ph $t6, $t6, 2 \n" - "srl $t1, $t6, 16 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t2, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| - "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t4, $t4, $t3 \n" - "addu.ph $t6, $t6, $t5 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shra_r.ph $t4, $t4, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.ph $t6, $t6, 1 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "shra_r.w $t0, $t0, 1 \n" - "srl $t1, $t6, 16 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t6, $t6 \n" // |26|27|24|25| - "srl $t0, $t0, 8 \n" // |X|2|3|0| - "srl $t3, $t3, 16 \n" // |X|X|15|14| - "srl $t5, $t5, 16 \n" // |X|X|23|22| - "srl $t7, $t7, 16 \n" // |X|X|31|30| - "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| - "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| - "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| - "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| - "prepend $t2, $t3, 24 \n" // |X|15|14|11| - "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| - "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu %[dst_width], %[dst_width], -12 \n" - "addiu $t8,%[dst_width], -12 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t4, 4(%[dst]) \n" - "sw $t6, 8(%[dst]) \n" - "bgez $t8, 1b \n" - " addiu %[dst], %[dst], 12 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* t = src_ptr + stride; - const int c = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| - "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 - "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 - "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| - "srl $t4, $t4, 2 \n" // t4 / 4 - "srl $t6, $t6, 16 \n" // |0|0|S3|T3| - "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 - "addu $t6, $t5, $t6 \n" - "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 - "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 - "addu $t0, $t0, $t2 \n" - "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[t], %[t], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t4, -1(%[dst_ptr]) \n" - "sb $t6, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [t] "+r" (t), - [dst_width] "+r" (dst_width) - : [c] "r" (c) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - stride += stride; - const uint8* s2 = src_ptr + stride; - const int c1 = 0x1C71; - const int c2 = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| - "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| - "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| - "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 - "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 - "sll $t8, $t5, 16 \n" // |R5|R4|0|0| - "raddu.w.qb $t8, $t8 \n" // R5+R4 - "addu $t7, $t7, $t8 \n" - "srl $t8, $t5, 16 \n" // |0|0|R7|R6| - "raddu.w.qb $t8, $t8 \n" // R7 + R6 - "addu $t6, $t6, $t8 \n" - "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA - "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| - "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| - "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 - "addu $t7, $t7, $t8 \n" - "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t2, $t2 \n" - "raddu.w.qb $t4, $t4 \n" - "addu $t0, $t0, $t2 \n" - "addu $t0, $t0, $t4 \n" - "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t7, $t7, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t6, -1(%[dst_ptr]) \n" - "sb $t7, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [s1] "+r" (s1), - [s2] "+r" (s2), - [dst_width] "+r" (dst_width) - : [c1] "r" (c1), [c2] "r" (c2) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc new file mode 100644 index 0000000000..482a521f0d --- /dev/null +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc index 44b0c8080d..459a2995df 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon.cc @@ -23,564 +23,541 @@ extern "C" { // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc - "subs %2, %2, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // add adjacent - "vpaddl.u8 q1, q1 \n" - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - MEMACCESS(1) - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - MEMACCESS(3) - "vld1.8 {q1}, [%3]! \n" - MEMACCESS(4) - "vld1.8 {q2}, [%4]! \n" - MEMACCESS(5) - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - MEMACCESS(1) - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc" - ); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); } #define HAS_SCALEROWDOWN38_NEON -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q3}, [%3] \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile ( - MEMACCESS(5) - "vld1.16 {q13}, [%5] \n" - MEMACCESS(6) - "vld1.8 {q14}, [%6] \n" - MEMACCESS(7) - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - MEMACCESS(4) - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" - ); + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(4) - "vld1.16 {q13}, [%4] \n" - MEMACCESS(5) - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" - // combine source lines - "vadd.u16 q1, q3 \n" + // combine source lines + "vadd.u16 q1, q3 \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov r12, %5 \n" - "veor q2, q2, q2 \n" - "veor q3, q3, q3 \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "vld1.8 {q0}, [%0], %3 \n" - "vaddw.u8 q3, q3, d1 \n" - "vaddw.u8 q2, q2, d0 \n" - "subs r12, r12, #1 \n" - "bgt 2b \n" - MEMACCESS(2) - "vst1.16 {q2, q3}, [%2]! \n" // store pixels - "add %1, %1, #16 \n" - "subs %4, %4, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + "vst1.16 {q2, q3}, [%2]! \n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" -// The NEON version mimics this formula: -// #define BLENDER(a, b, f) (uint8)((int)(a) + -// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; + const uint8_t* src_tmp = src_ptr; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -617,7 +594,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0]! \n" // store pixels "vadd.s32 q1, q1, q0 \n" "vadd.s32 q2, q2, q0 \n" @@ -639,325 +615,299 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" - "99: \n" - MEMACCESS(0) - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" - ); + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.32 {q0, q1}, [%0]! \n" - MEMACCESS(0) - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - "vrshrn.u16 d2, q2, #1 \n" - "vrshrn.u16 d3, q3, #1 \n" - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - "1: \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld1.32 {"#dn"["#n"]}, [%6] \n" +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int tmp; - const uint8* src_tmp = src_argb; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - - MEMACCESS(0) - "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1" - ); + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; + const uint8_t* src_tmp = src_argb; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -993,7 +943,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d1, q12, #7 \n" - MEMACCESS(0) "vst1.32 {d0, d1}, [%0]! \n" // store pixels "vadd.s32 q8, q8, q9 \n" "subs %2, %2, #4 \n" // 4 processed per loop diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc index ff277f26ff..494a9cfbfb 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/scale.h" #include "libyuv/row.h" +#include "libyuv/scale.h" #include "libyuv/scale_row.h" #ifdef __cplusplus @@ -21,580 +21,556 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc - "subs %w2, %w2, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // add adjacent - "uaddlp v1.8h, v1.16b \n" - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #1 \n" - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc - MEMACCESS(1) - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - MEMACCESS(3) - "ld1 {v1.16b}, [%2], #16 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%3], #16 \n" - MEMACCESS(5) - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - MEMACCESS(1) - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", - "v20", "memory", "cc" - ); + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" - ); + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); } -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v3.16b}, [%3] \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(5) - "ld1 {v29.8h}, [%5] \n" - MEMACCESS(6) - "ld1 {v30.16b}, [%6] \n" - MEMACCESS(7) - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - MEMACCESS(4) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(4) - "ld1 {v30.8h}, [%4] \n" - MEMACCESS(5) - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent + // Align for table lookup, vtbl requires registers to + // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - MEMACCESS(2) - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {v4.b, v5.b}["#n"], [%6] \n" +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -626,12 +602,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "ushll2 v6.4s, v6.8h, #0 \n" "mul v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" - MEMACCESS(0) "st1 {v4.8b}, [%0], #8 \n" // store pixels "add v1.4s, v1.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n" @@ -639,7 +614,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -653,331 +628,300 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" - "99: \n" - MEMACCESS(0) - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction),// %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" - ); + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS (0) - "ld2 {v0.4s, v1.4s}, [%0], #32 \n" - MEMACCESS (0) - "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS (1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - MEMACCESS (1) - "st1 {v3.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (dst), // %1 - "+r" (dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS (0) - // load 8 ARGB pixels. - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #1 \n" - "rshrn v2.8b, v2.8h, #1 \n" - "rshrn v3.8b, v3.8h, #1 \n" - MEMACCESS (1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS (0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - MEMACCESS (1) - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - MEMACCESS (2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (src_stride), // %1 - "+r" (dst), // %2 - "+r" (dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64)(src_stepx * 4)) // %3 - : "memory", "cc", "v0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "ld1 {v1.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v3.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v5.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld1 {"#vn".s}["#n"], [%6] \n" +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; - int64 tmp64; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - - MEMACCESS(0) - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1" - ); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -1014,14 +958,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - MEMACCESS(0) "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -1034,6 +977,85 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, #undef LOAD2_DATA32_LANE +// Read 16x2 average down and write 8x1. +void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc index f17097365c..c5fc86f3e9 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/scale_win.cc @@ -17,97 +17,93 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -120,27 +116,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -153,20 +150,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] @@ -174,15 +172,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add + paddw xmm0, xmm2 // vertical add paddw xmm1, xmm3 psrlw xmm0, 1 psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -197,23 +195,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -225,30 +224,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x1 rectangle to 32x1. -__declspec(naked) -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -262,20 +262,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // For rounding, average = (sum + 2) / 4 // becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. -__declspec(naked) -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] @@ -283,18 +284,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm0, ymm0, ymm2 // vertical add vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -308,15 +309,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. -__declspec(naked) -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 @@ -339,50 +341,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 movdqa xmm5, xmm4 packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 + psllw xmm5, 3 // constant 0x0008 wloop: - movdqu xmm0, [eax] // average rows + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm0, xmm2 // vertical add rows 0, 1 paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 + paddw xmm0, xmm2 // add row 2 paddw xmm1, xmm3 movdqu xmm2, [eax + edi] movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 + paddw xmm0, xmm2 // add row 3 paddw xmm1, xmm3 phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -397,15 +400,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. -__declspec(naked) -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 vpsrld ymm5, ymm5, 24 vpslld ymm5, ymm5, 16 @@ -416,10 +420,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpand ymm0, ymm0, ymm5 vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -431,52 +435,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x4 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpsllw ymm5, ymm4, 3 // constant 0x0008 vpackuswb ymm4, ymm4, ymm4 wloop: - vmovdqu ymm0, [eax] // average rows + vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm0, ymm0, ymm2 // add row 2 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + edi] vmovdqu ymm3, [eax + edi + 32] lea eax, [eax + 64] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm0, ymm0, ymm2 // add row 3 vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -494,14 +499,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm3, xmmword ptr kShuf0 movdqa xmm4, xmmword ptr kShuf1 movdqa xmm5, xmmword ptr kShuf2 @@ -541,16 +547,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -559,7 +565,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -568,7 +574,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 @@ -577,7 +583,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 @@ -598,16 +604,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. -__declspec(naked) -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -616,7 +622,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -626,7 +632,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -636,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 @@ -660,26 +666,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm4, xmmword ptr kShuf38a movdqa xmm5, xmmword ptr kShuf38b xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edx], xmm0 // write 12 pixels + movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] @@ -691,23 +698,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAc movdqa xmm3, xmmword ptr kShufAc3 movdqa xmm4, xmmword ptr kScaleAc33 pxor xmm5, xmm5 xloop: - movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 @@ -725,14 +732,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 paddusw xmm6, xmm0 psrldq xmm0, 2 paddusw xmm6, xmm0 pshufb xmm6, xmm2 - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 paddusw xmm7, xmm1 psrldq xmm1, 2 @@ -740,10 +747,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, pshufb xmm7, xmm3 paddusw xmm6, xmm7 - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 packuswb xmm6, xmm6 - movd [edx], xmm6 // write 6 pixels + movd [edx], xmm6 // write 6 pixels psrlq xmm6, 16 movd [edx + 2], xmm6 lea edx, [edx + 6] @@ -756,28 +763,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAb0 movdqa xmm3, xmmword ptr kShufAb1 movdqa xmm4, xmmword ptr kShufAb2 movdqa xmm5, xmmword ptr kScaleAb2 xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm1, [eax + esi] lea eax, [eax + 16] pavgb xmm0, xmm1 - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 movdqa xmm6, xmm0 pshufb xmm6, xmm3 @@ -785,10 +792,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, pshufb xmm0, xmm4 paddusw xmm1, xmm0 - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 packuswb xmm1, xmm1 - movd [edx], xmm1 // write 6 pixels + movd [edx], xmm1 // write 6 pixels psrlq xmm1, 16 movd [edx + 2], xmm1 lea edx, [edx + 6] @@ -801,26 +808,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16 bytes and accumulates to 16 shorts at a time. -__declspec(naked) -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: - movdqu xmm3, [eax] // read 16 bytes + movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm0, [edx] // read 16 words from destination movdqu xmm1, [edx + 16] movdqa xmm2, xmm3 punpcklbw xmm2, xmm5 punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words + paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx], xmm0 // write 16 words to destination movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 16 @@ -831,24 +839,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -__declspec(naked) -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: - vmovdqu ymm3, [eax] // read 32 bytes + vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck vpunpcklbw ymm2, ymm3, ymm5 vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm0, ymm2, [edx] // sum 16 words vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx], ymm0 // write 32 words to destination vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 32 @@ -862,86 +871,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { push ebx push esi push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width movd xmm2, [esp + 12 + 16] // x movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. + mov eax, 0x04040000 // shuffle to line up fractions with pixel. movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 + pcmpeqb xmm7, xmm7 // generate 0x0001 psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movzx ebx, word ptr [esi + edx] // 2 source x1 pixels movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 + pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits movd ebx, xmm2 mov [edi], bl @@ -955,13 +965,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. -__declspec(naked) -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -980,15 +992,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1005,23 +1017,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1033,16 +1045,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1050,11 +1062,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1067,18 +1079,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. -__declspec(naked) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1103,21 +1116,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -__declspec(naked) -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1132,11 +1145,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, movq xmm3, qword ptr [esi + ebx * 2] movhps xmm3, qword ptr [esi + edi] lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1151,64 +1164,66 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. -__declspec(naked) -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push edi push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. cmp ecx, 0 jle xloop99 sub ecx, 4 jl xloop49 - // 4 Pixel loop. + // 4 Pixel loop. xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 movd xmm1, [esi + eax * 4] // 1 source x2 pixels movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 movdqu [edi], xmm0 lea edi, [edi + 16] - sub ecx, 4 // 4 pixels + sub ecx, 4 // 4 pixels jge xloop4 xloop49: test ecx, 2 je xloop29 - // 2 Pixels. + // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 movq qword ptr [edi], xmm0 lea edi, [edi + 8] @@ -1217,7 +1232,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, test ecx, 1 je xloop99 - // 1 Pixels. + // 1 Pixels. movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 xloop99: @@ -1232,60 +1247,62 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Port to Neon // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx movdqa xmm4, xmmword ptr kShuffleColARGB movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. movq qword ptr [edi], xmm0 lea edi, [edi + 8] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: @@ -1293,15 +1310,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. movd [edi], xmm0 xloop99: @@ -1313,13 +1330,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -1338,12 +1357,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv_X86(int num, int div) { +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 idiv dword ptr [esp + 8] ret @@ -1351,13 +1369,12 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv1_X86(int num, int div) { +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 sub eax, 0x00010001 sbb edx, 0 diff --git a/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc index 00fb71e18b..92384c050c 100644 --- a/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc +++ b/media/libvpx/libvpx/third_party/libyuv/source/video_common.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "libyuv/video_common.h" #ifdef __cplusplus @@ -16,40 +15,39 @@ namespace libyuv { extern "C" { #endif -#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) - struct FourCCAliasEntry { - uint32 alias; - uint32 canonical; + uint32_t alias; + uint32_t canonical; }; -static const struct FourCCAliasEntry kFourCCAliases[] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW }, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 }; // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. // {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA LIBYUV_API -uint32 CanonicalFourCC(uint32 fourcc) { +uint32_t CanonicalFourCC(uint32_t fourcc) { int i; - for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + for (i = 0; i < NUM_ALIASES; ++i) { if (kFourCCAliases[i].alias == fourcc) { return kFourCCAliases[i].canonical; } @@ -62,4 +60,3 @@ uint32 CanonicalFourCC(uint32 fourcc) { } // extern "C" } // namespace libyuv #endif - diff --git a/media/libvpx/libvpx/third_party/nalloc/LICENSE b/media/libvpx/libvpx/third_party/nalloc/LICENSE new file mode 100644 index 0000000000..14c2b9e737 --- /dev/null +++ b/media/libvpx/libvpx/third_party/nalloc/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Catena cyber + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/media/libvpx/libvpx/third_party/nalloc/README.libvpx b/media/libvpx/libvpx/third_party/nalloc/README.libvpx new file mode 100644 index 0000000000..0acafdf2f7 --- /dev/null +++ b/media/libvpx/libvpx/third_party/nalloc/README.libvpx @@ -0,0 +1,11 @@ +Name: nalloc +URL: https://github.com/catenacyber/nallocfuzz +Version: dc351a94bbded5ede5b7550d6d08e78e0cc6dcef +License: MIT +License File: LICENSE + +Description: +Nalloc is a tool to inject allocation failures while fuzzing. + +Local Modifications: +None diff --git a/media/libvpx/libvpx/third_party/nalloc/nalloc.h b/media/libvpx/libvpx/third_party/nalloc/nalloc.h new file mode 100644 index 0000000000..f910926b18 --- /dev/null +++ b/media/libvpx/libvpx/third_party/nalloc/nalloc.h @@ -0,0 +1,342 @@ +/* + MIT License + + Copyright (c) 2025 Catena cyber + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef NALLOC_H_ +#define NALLOC_H_ + +#if defined(__clang__) && defined(__has_feature) +#if __has_feature(address_sanitizer) +#define NALLOC_ASAN 1 +#endif +#if __has_feature(memory_sanitizer) +#define FUZZER_DISABLE_NALLOC 1 +#endif +#endif + +#if defined(FUZZER_DISABLE_NALLOC) +#define nalloc_init(x) +#define nalloc_restrict_file_prefix(x) +#define nalloc_start(x, y) +#define nalloc_end() +#else + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static const uint32_t nalloc_crc32_table[] = { + 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, + 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, + 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, + 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, + 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, + 0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, 0x91a18d8e, 0x95609039, + 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef, + 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, + 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, 0xc3f706fb, + 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, + 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, + 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, + 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, + 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, + 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, + 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, + 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, + 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, + 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, + 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, + 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, + 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637, + 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, + 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, + 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5, + 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, + 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, + 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b, + 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, + 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, + 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, + 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, + 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, + 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, + 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, + 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, + 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, + 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, + 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, + 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, + 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, + 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, + 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 +}; + +// Nallocfuzz data to take a decision +uint32_t nalloc_random_state = 0; +__thread unsigned int nalloc_running = 0; +bool nalloc_initialized = false; +uint32_t nalloc_runs = 0; + +// Nalloc fuzz parameters +uint32_t nalloc_bitmask = 0xFF; +bool nalloc_random_bitmask = true; +uint32_t nalloc_magic = 0x294cee63; +bool nalloc_verbose = false; + +#ifdef NALLOC_ASAN +extern void __sanitizer_print_stack_trace(void); +#endif + +// Generic init, using env variables to get parameters +void nalloc_init(const char *prog) { + if (nalloc_initialized) { + return; + } + nalloc_initialized = true; + char *bitmask = getenv("NALLOC_FREQ"); + if (bitmask) { + int shift = atoi(bitmask); + if (shift > 0 && shift < 31) { + nalloc_bitmask = 1 << shift; + nalloc_random_bitmask = false; + } else if (shift == 0) { + nalloc_random_bitmask = false; + nalloc_bitmask = 0; + } + } else if (prog == NULL || strstr(prog, "nalloc") == NULL) { + nalloc_random_bitmask = false; + nalloc_bitmask = 0; + return; + } + + char *magic = getenv("NALLOC_MAGIC"); + if (magic) { + nalloc_magic = (uint32_t)strtol(magic, NULL, 0); + } + + char *verbose = getenv("NALLOC_VERBOSE"); + if (verbose) { + nalloc_verbose = true; + } +} + +// add one byte to the CRC +static inline void nalloc_random_update(uint8_t b) { + nalloc_random_state = + ((uint32_t)((uint32_t)nalloc_random_state << 8)) ^ + nalloc_crc32_table[((nalloc_random_state >> 24) ^ b) & 0xFF]; +} + +// Start the failure injections, using a buffer as seed +static int nalloc_start(const uint8_t *data, size_t size) { + if (nalloc_random_bitmask) { + if (nalloc_random_state & 0x10) { + nalloc_bitmask = 0xFFFFFFFF; + } else { + nalloc_bitmask = 1 << (5 + (nalloc_random_state & 0xF)); + } + } else if (nalloc_bitmask == 0) { + // nalloc disabled + return 0; + } + nalloc_random_state = 0; + for (size_t i = 0; i < size; i++) { + nalloc_random_update(data[i]); + } + if (__sync_fetch_and_add(&nalloc_running, 1)) { + __sync_fetch_and_sub(&nalloc_running, 1); + return 0; + } + nalloc_runs++; + return 1; +} + +// Stop the failure injections +static void nalloc_end() { __sync_fetch_and_sub(&nalloc_running, 1); } + +static bool nalloc_backtrace_exclude(size_t size, const char *op) { + if (nalloc_verbose) { + fprintf(stderr, "failed %s(%zu) \n", op, size); +#ifdef NALLOC_ASAN + __sanitizer_print_stack_trace(); +#endif + } + + return false; +} + +// +static bool nalloc_fail(size_t size, const char *op) { + // do not fail before thread init + if (nalloc_runs == 0) { + return false; + } + if (__sync_fetch_and_add(&nalloc_running, 1) != 1) { + // do not fail allocations outside of fuzzer input + // and do not fail inside of this function + __sync_fetch_and_sub(&nalloc_running, 1); + return false; + } + nalloc_random_update((uint8_t)size); + if (size >= 0x100) { + nalloc_random_update((uint8_t)(size >> 8)); + if (size >= 0x10000) { + nalloc_random_update((uint8_t)(size >> 16)); + // bigger may already fail or oom + } + } + if (((nalloc_random_state ^ nalloc_magic) & nalloc_bitmask) == 0) { + if (nalloc_backtrace_exclude(size, op)) { + __sync_fetch_and_sub(&nalloc_running, 1); + return false; + } + __sync_fetch_and_sub(&nalloc_running, 1); + return true; + } + __sync_fetch_and_sub(&nalloc_running, 1); + return false; +} + +// ASAN interceptor for libc routines +#ifdef NALLOC_ASAN +extern void *__interceptor_malloc(size_t); +extern void *__interceptor_calloc(size_t, size_t); +extern void *__interceptor_realloc(void *, size_t); +extern void *__interceptor_reallocarray(void *, size_t, size_t); + +extern ssize_t __interceptor_read(int, void *, size_t); +extern ssize_t __interceptor_write(int, const void *, size_t); +extern ssize_t __interceptor_recv(int, void *, size_t, int); +extern ssize_t __interceptor_send(int, const void *, size_t, int); + +#define nalloc_malloc(s) __interceptor_malloc(s) +#define nalloc_calloc(s, n) __interceptor_calloc(s, n) +#define nalloc_realloc(p, s) __interceptor_realloc(p, s) +#define nalloc_reallocarray(p, s, n) __interceptor_reallocarray(p, s, n) + +#define nalloc_read(f, b, s) __interceptor_read(f, b, s) +#define nalloc_write(f, b, s) __interceptor_write(f, b, s) +#define nalloc_recv(f, b, s, x) __interceptor_recv(f, b, s, x) +#define nalloc_send(f, b, s, x) __interceptor_send(f, b, s, x) + +#else +extern void *__libc_malloc(size_t); +extern void *__libc_calloc(size_t, size_t); +extern void *__libc_realloc(void *, size_t); +extern void *__libc_reallocarray(void *, size_t, size_t); + +extern ssize_t __read(int, void *, size_t); +extern ssize_t __write(int, const void *, size_t); +extern ssize_t __recv(int, void *, size_t, int); +extern ssize_t __send(int, const void *, size_t, int); + +#define nalloc_malloc(s) __libc_malloc(s) +#define nalloc_calloc(s, n) __libc_calloc(s, n) +#define nalloc_realloc(p, s) __libc_realloc(p, s) +#define nalloc_reallocarray(p, s, n) __libc_reallocarray(p, s, n) + +#define nalloc_read(f, b, s) __read(f, b, s) +#define nalloc_write(f, b, s) __write(f, b, s) +#define nalloc_recv(f, b, s, x) __recv(f, b, s, x) +#define nalloc_send(f, b, s, x) __send(f, b, s, x) +#endif + +// nalloc standard function overwrites with pseudo-random failures +ssize_t read(int fd, void *buf, size_t count) { + if (nalloc_fail(count, "read")) { + errno = EIO; + return -1; + } + return nalloc_read(fd, buf, count); +} + +ssize_t write(int fd, const void *buf, size_t count) { + if (nalloc_fail(count, "write")) { + errno = EIO; + return -1; + } + return nalloc_write(fd, buf, count); +} + +ssize_t recv(int fd, void *buf, size_t count, int flags) { + if (nalloc_fail(count, "recv")) { + errno = EIO; + return -1; + } + return nalloc_recv(fd, buf, count, flags); +} + +ssize_t send(int fd, const void *buf, size_t count, int flags) { + if (nalloc_fail(count, "send")) { + errno = EIO; + return -1; + } + return nalloc_send(fd, buf, count, flags); +} + +void *calloc(size_t nmemb, size_t size) { + if (nalloc_fail(size, "calloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_calloc(nmemb, size); +} + +void *malloc(size_t size) { + if (nalloc_fail(size, "malloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_malloc(size); +} + +void *realloc(void *ptr, size_t size) { + if (nalloc_fail(size, "realloc")) { + errno = ENOMEM; + return NULL; + } + return nalloc_realloc(ptr, size); +} + +void *reallocarray(void *ptr, size_t nmemb, size_t size) { + if (nalloc_fail(size, "reallocarray")) { + errno = ENOMEM; + return NULL; + } + return nalloc_reallocarray(ptr, nmemb, size); +} + +#ifdef __cplusplus +} // extern "C" { +#endif + +#endif // FUZZER_DISABLE_NALLOC + +#endif // NALLOC_H_ diff --git a/media/libvpx/libvpx/third_party/x86inc/README.libvpx b/media/libvpx/libvpx/third_party/x86inc/README.libvpx index 8d3cd966da..195654f7bb 100644 --- a/media/libvpx/libvpx/third_party/x86inc/README.libvpx +++ b/media/libvpx/libvpx/third_party/x86inc/README.libvpx @@ -1,5 +1,5 @@ URL: https://git.videolan.org/git/x264.git -Version: d23d18655249944c1ca894b451e2c82c7a584c62 +Version: 3e5aed95cc470f37e2db3e6506a8deb89b527720 License: ISC License File: LICENSE @@ -12,9 +12,8 @@ Get configuration from vpx_config.asm. Prefix functions with vpx by default. Manage name mangling (prefixing with '_') manually because 'PREFIX' does not exist in libvpx. -Expand PIC default to macho64 and respect CONFIG_PIC from libvpx -Set 'private_extern' visibility for macho targets. Copy PIC 'GLOBAL' macros from x86_abi_support.asm Use .text instead of .rodata on macho to avoid broken tables in PIC mode. -Use .text with no alignment for aout -Only use 'hidden' visibility with Chromium +Use .text with no alignment for aout. +Only use 'hidden' visibility with Chromium. +Prefix ARCH_* with VPX_. diff --git a/media/libvpx/libvpx/third_party/x86inc/x86inc.asm b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm index b647dff2f8..3d55e921c7 100644 --- a/media/libvpx/libvpx/third_party/x86inc/x86inc.asm +++ b/media/libvpx/libvpx/third_party/x86inc/x86inc.asm @@ -1,12 +1,12 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2016 x264 project +;* Copyright (C) 2005-2019 x264 project ;* ;* Authors: Loren Merritt +;* Henrik Gramner ;* Anton Mitrofanov ;* Fiona Glaser -;* Henrik Gramner ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above @@ -45,7 +45,7 @@ %endif %ifndef STACK_ALIGNMENT - %if ARCH_X86_64 + %if VPX_ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 @@ -54,7 +54,7 @@ %define WIN64 0 %define UNIX64 0 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 @@ -67,19 +67,19 @@ %endif %define FORMAT_ELF 0 +%define FORMAT_MACHO 0 %ifidn __OUTPUT_FORMAT__,elf %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define FORMAT_ELF 1 -%endif - -%define FORMAT_MACHO 0 -%ifidn __OUTPUT_FORMAT__,macho32 - %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 - %define FORMAT_MACHO 1 + %define FORMAT_MACHO 1 %endif ; Set PREFIX for libvpx builds. @@ -103,7 +103,11 @@ ; works around the issue. It appears to be specific to the way libvpx ; handles the tables. %macro SECTION_RODATA 0-1 16 - %ifidn __OUTPUT_FORMAT__,macho32 + %ifidn __OUTPUT_FORMAT__,win32 + SECTION .rdata align=%1 + %elif WIN64 + SECTION .rdata align=%1 + %elifidn __OUTPUT_FORMAT__,macho32 SECTION .text align=%1 fakegot: %elifidn __OUTPUT_FORMAT__,aout @@ -113,8 +117,7 @@ %endif %endmacro -; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" -; from original code is added in for 64bit. +; PIC macros from vpx_ports/x86_abi_support.asm. %ifidn __OUTPUT_FORMAT__,elf32 %define ABI_IS_32BIT 1 %elifidn __OUTPUT_FORMAT__,macho32 @@ -165,7 +168,7 @@ %endif %endif - %if ARCH_X86_64 == 0 + %if VPX_ARCH_X86_64 == 0 %undef PIC %endif @@ -203,10 +206,24 @@ %ifndef GET_GOT_DEFINED %define GET_GOT_DEFINED 0 %endif -; Done with PIC macros +; End PIC macros from vpx_ports/x86_abi_support.asm. + +; libvpx explicitly sets visibilty in shared object builds. Avoid setting +; visibility to hidden as it may break builds that split sources on e.g., +; directory boundaries. +%ifdef CHROMIUM + %define VISIBILITY hidden + %define HAVE_PRIVATE_EXTERN 1 +%else + %define VISIBILITY + %define HAVE_PRIVATE_EXTERN 0 +%endif %ifdef __NASM_VER__ %use smartalign + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + %define HAVE_PRIVATE_EXTERN 0 + %endif %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: @@ -260,7 +277,7 @@ %if %0 == 2 %define r%1m %2d %define r%1mp %2 - %elif ARCH_X86_64 ; memory + %elif VPX_ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else @@ -281,7 +298,7 @@ %define e%1h %3 %define r%1b %2 %define e%1b %2 - %if ARCH_X86_64 == 0 + %if VPX_ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro @@ -318,12 +335,24 @@ DECLARE_REG_SIZE bp, bpl, null DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif +%macro LEA 2 +%if VPX_ARCH_X86_64 + lea %1, [%2] +%elif PIC + call $+5 ; special-cased to not affect the RSB on most CPU:s + pop %1 + add %1, (%2)-$+1 +%else + mov %1, %2 +%endif +%endmacro + %macro PUSH 1 push %1 %ifidn rstk, rsp @@ -385,6 +414,10 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro +%if VPX_ARCH_X86_64 == 0 + %define movsxd movifnidn +%endif + %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 @@ -433,6 +466,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (VPX_ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 @@ -483,10 +518,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 + ; Reserve an additional register for storing the original stack pointer, but avoid using + ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) + %if VPX_ARCH_X86_64 && regs_used == 7 + %assign regs_used 8 + %elif VPX_ARCH_X86_64 == 0 && regs_used == 1 + %assign regs_used 2 + %endif %endif - %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 - ; Ensure that we don't clobber any registers containing arguments + %if VPX_ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) + ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. %assign regs_used 5 + UNIX64 * 3 %endif %endif @@ -516,10 +559,10 @@ DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 @@ -538,15 +581,16 @@ DECLARE_REG 14, R15, 120 %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 + %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif - %if xmm_regs_used > 7 + %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif - %if xmm_regs_used > 8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 %assign %%i 8 - %rep xmm_regs_used-8 + %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep @@ -555,59 +599,62 @@ DECLARE_REG 14, R15, 120 %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro -%macro WIN64_RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else - add %1, stack_size_padded + add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro -%elif ARCH_X86_64 ; *nix x64 ;============================================= +%elif VPX_ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi @@ -620,14 +667,15 @@ DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 + %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 @@ -637,7 +685,7 @@ DECLARE_REG 14, R15, 72 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -648,7 +696,7 @@ DECLARE_REG 14, R15, 72 %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -693,7 +741,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -704,7 +752,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif %endif POP_IF_USED 6, 5, 4, 3 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -715,7 +763,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro - %macro WIN64_RESTORE_XMM 1 + %macro WIN64_RESTORE_XMM 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro @@ -726,7 +774,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 - %if has_epilogue + %if has_epilogue || cpuflag(ssse3) RET %else rep ret @@ -758,7 +806,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp -%macro TAIL_CALL 2 ; callee, is_nonadjacent +%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET @@ -788,35 +836,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endmacro %macro cglobal_internal 2-3+ annotate_function_size - %if %1 - %xdefine %%FUNCTION_PREFIX private_prefix - ; libvpx explicitly sets visibility in shared object builds. Avoid - ; setting visibility to hidden as it may break builds that split - ; sources on e.g., directory boundaries. - %ifdef CHROMIUM - %xdefine %%VISIBILITY hidden - %else - %xdefine %%VISIBILITY - %endif - %else - %xdefine %%FUNCTION_PREFIX public_prefix - %xdefine %%VISIBILITY - %endif %ifndef cglobaled_%2 - %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %if %1 + %xdefine %2 mangle(private_prefix %+ _ %+ %2) + %else + %xdefine %2 mangle(public_prefix %+ _ %+ %2) + %endif %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %xdefine current_function_section __SECT__ %if FORMAT_ELF - global %2:function %%VISIBILITY - %elif FORMAT_MACHO - %ifdef __NASM_VER__ - global %2 + %if %1 + global %2:function VISIBILITY %else - global %2:private_extern + global %2:function %endif + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 + global %2:private_extern %else global %2 %endif @@ -827,12 +865,24 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif %endmacro +; Create a global symbol from a local label with the correct name mangling and type +%macro cglobal_label 1 + %if FORMAT_ELF + global current_function %+ %1:function VISIBILITY + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global current_function %+ %1:private_extern + %else + global current_function %+ %1 + %endif + %1: +%endmacro + %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 @@ -851,7 +901,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %if FORMAT_ELF - global %1:data hidden + global %1:data VISIBILITY + %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN + global %1:private_extern %else global %1 %endif @@ -890,24 +942,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_gfni (1<<13)| cpuflags_sse42 +%assign cpuflags_avx (1<<14)| cpuflags_sse42 +%assign cpuflags_xop (1<<15)| cpuflags_avx +%assign cpuflags_fma4 (1<<16)| cpuflags_avx +%assign cpuflags_fma3 (1<<17)| cpuflags_avx +%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_cache32 (1<<22) +%assign cpuflags_cache64 (1<<23) +%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<25) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -948,9 +1002,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %endif - %if ARCH_X86_64 || cpuflag(sse2) + %if VPX_ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VER__ - ALIGNMODE k8 + ALIGNMODE p6 %else CPU amdnop %endif @@ -963,11 +1017,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %endmacro -; Merge mmx and sse* +; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -977,69 +1032,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %undef %1%2 %endmacro +%macro DEFINE_MMREGS 1 ; mmtype + %assign %%prev_mmregs 0 + %ifdef num_mmregs + %assign %%prev_mmregs num_mmregs + %endif + + %assign num_mmregs 8 + %if VPX_ARCH_X86_64 && mmsize >= 16 + %assign num_mmregs 16 + %if cpuflag(avx512) || mmsize == 64 + %assign num_mmregs 32 + %endif + %endif + + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1 %+ %%i + CAT_XDEFINE nn%1, %%i, %%i + %assign %%i %%i+1 + %endrep + %if %%prev_mmregs > num_mmregs + %rep %%prev_mmregs - num_mmregs + CAT_UNDEF m, %%i + CAT_UNDEF nn %+ mmtype, %%i + %assign %%i %%i+1 + %endrep + %endif + %xdefine mmtype %1 +%endmacro + +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if VPX_ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 - %define num_mmregs 8 %define mova movq %define movu movq %define movh movd %define movnta movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nnmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nnmm, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nnxmm, %%i, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS xmm + %if WIN64 + AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers + %endif %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 - %define num_mmregs 8 - %if ARCH_X86_64 - %define num_mmregs 16 - %endif %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nnymm, %%i, %%i - %assign %%i %%i+1 - %endrep INIT_CPUFLAGS %1 + DEFINE_MMREGS ymm + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + INIT_CPUFLAGS %1 + DEFINE_MMREGS zmm + AVX512_MM_PERMUTATION %endmacro INIT_XMM @@ -1048,18 +1133,26 @@ INIT_XMM %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 %endmacro %assign i 0 -%rep 16 +%rep 32 DECLARE_MMCAST i %assign i i+1 %endrep @@ -1129,25 +1222,42 @@ INIT_XMM %endif %assign %%i 0 %rep num_mmregs - CAT_XDEFINE %%f, %%i, m %+ %%i + %xdefine %%tmp m %+ %%i + CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp %assign %%i %%i+1 %endrep %endmacro -%macro LOAD_MM_PERMUTATION 1 ; name to load from - %ifdef %1_m0 +%macro LOAD_MM_PERMUTATION 0-1 ; name to load from + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %xdefine %%tmp %%f %+ 0 + %ifnum %%tmp + RESET_MM_PERMUTATION %assign %%i 0 %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE nn, m %+ %%i, %%i + %xdefine %%tmp %%f %+ %%i + CAT_XDEFINE %%m, %%i, m %+ %%tmp %assign %%i %%i+1 %endrep + %rep num_mmregs + %assign %%i %%i-1 + CAT_XDEFINE m, %%i, %%m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 - call_internal %1 %+ SUFFIX, %1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif %endmacro %macro call_internal 2 %xdefine %%i %2 @@ -1190,12 +1300,17 @@ INIT_XMM ;============================================================================= %assign i 0 -%rep 16 +%rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i @@ -1214,7 +1329,7 @@ INIT_XMM ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ @@ -1238,8 +1353,22 @@ INIT_XMM %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function - %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) %error use of ``%1'' sse2 instruction in cpuname function: current_function + %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) + %error use of ``%1'' avx2 instruction in cpuname function: current_function + %elif __sizeofreg == 16 && notcpuflag(sse) + %error use of ``%1'' sse instruction in cpuname function: current_function + %elif __sizeofreg == 32 && notcpuflag(avx) + %error use of ``%1'' avx instruction in cpuname function: current_function + %elif __sizeofreg == 64 && notcpuflag(avx512) + %error use of ``%1'' avx512 instruction in cpuname function: current_function + %elifidn %1, pextrw ; special case because the base instruction is mmx2, + %ifnid %6 ; but sse4 is required for memory operands + %if notcpuflag(sse4) + %error use of ``%1'' sse4 instruction in cpuname function: current_function + %endif + %endif %endif %endif %endif @@ -1247,14 +1376,12 @@ INIT_XMM %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 - %ifnidn %6, %7 - %if %0 >= 9 - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 - %else - CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 - %endif - %if %5 && %4 == 0 - %ifnid %8 + %if %5 && %4 == 0 + %ifnidn %6, %7 + %ifidn %6, %8 + %xdefine __src1 %8 + %xdefine __src2 %7 + %elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. @@ -1262,6 +1389,13 @@ INIT_XMM %xdefine __src2 %7 %endif %endif + %endif + %ifnidn %6, __src1 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 + %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 @@ -1278,9 +1412,40 @@ INIT_XMM %elif %0 >= 9 __instr %6, %7, %8, %9 %elif %0 == 8 - __instr %6, %7, %8 + %if avx_enabled && %5 + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnum regnumof%7 + %ifnum regnumof%8 + %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 + ; Most VEX-encoded instructions require an additional byte to encode when + ; src2 is a high register (e.g. m8..15). If the instruction is commutative + ; we can swap src1 and src2 when doing so reduces the instruction length. + %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7, %8 + %endif %elif %0 == 7 - __instr %6, %7 + %if avx_enabled && %5 + %xdefine __src1 %6 + %xdefine __src2 %7 + %ifnum regnumof%6 + %ifnum regnumof%7 + %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 + %xdefine __src1 %7 + %xdefine __src2 %6 + %endif + %endif + %endif + __instr %6, __src1, __src2 + %else + __instr %6, %7 + %endif %else __instr %6 %endif @@ -1289,9 +1454,9 @@ INIT_XMM ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int -;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not -%macro AVX_INSTR 1-5 fnord, 0, 1, 0 +%macro AVX_INSTR 1-5 fnord, 0, 255, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 @@ -1307,77 +1472,112 @@ INIT_XMM %endmacro %endmacro -; Instructions with both VEX and non-VEX encodings +; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 -AVX_INSTR addsd, sse2, 1, 0, 1 -AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 0 +AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 -AVX_INSTR aesdec, fnord, 0, 0, 0 -AVX_INSTR aesdeclast, fnord, 0, 0, 0 -AVX_INSTR aesenc, fnord, 0, 0, 0 -AVX_INSTR aesenclast, fnord, 0, 0, 0 -AVX_INSTR aesimc -AVX_INSTR aeskeygenassist +AVX_INSTR aesdec, aesni, 0, 0, 0 +AVX_INSTR aesdeclast, aesni, 0, 0, 0 +AVX_INSTR aesenc, aesni, 0, 0, 0 +AVX_INSTR aesenclast, aesni, 0, 0, 0 +AVX_INSTR aesimc, aesni +AVX_INSTR aeskeygenassist, aesni AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 -AVX_INSTR blendpd, sse4, 1, 0, 0 -AVX_INSTR blendps, sse4, 1, 0, 0 -AVX_INSTR blendvpd, sse4, 1, 0, 0 -AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR blendpd, sse4, 1, 1, 0 +AVX_INSTR blendps, sse4, 1, 1, 0 +AVX_INSTR blendvpd, sse4 ; can't be emulated +AVX_INSTR blendvps, sse4 ; can't be emulated +AVX_INSTR cmpeqpd, sse2, 1, 0, 1 +AVX_INSTR cmpeqps, sse, 1, 0, 1 +AVX_INSTR cmpeqsd, sse2, 1, 0, 0 +AVX_INSTR cmpeqss, sse, 1, 0, 0 +AVX_INSTR cmplepd, sse2, 1, 0, 0 +AVX_INSTR cmpleps, sse, 1, 0, 0 +AVX_INSTR cmplesd, sse2, 1, 0, 0 +AVX_INSTR cmpless, sse, 1, 0, 0 +AVX_INSTR cmpltpd, sse2, 1, 0, 0 +AVX_INSTR cmpltps, sse, 1, 0, 0 +AVX_INSTR cmpltsd, sse2, 1, 0, 0 +AVX_INSTR cmpltss, sse, 1, 0, 0 +AVX_INSTR cmpneqpd, sse2, 1, 0, 1 +AVX_INSTR cmpneqps, sse, 1, 0, 1 +AVX_INSTR cmpneqsd, sse2, 1, 0, 0 +AVX_INSTR cmpneqss, sse, 1, 0, 0 +AVX_INSTR cmpnlepd, sse2, 1, 0, 0 +AVX_INSTR cmpnleps, sse, 1, 0, 0 +AVX_INSTR cmpnlesd, sse2, 1, 0, 0 +AVX_INSTR cmpnless, sse, 1, 0, 0 +AVX_INSTR cmpnltpd, sse2, 1, 0, 0 +AVX_INSTR cmpnltps, sse, 1, 0, 0 +AVX_INSTR cmpnltsd, sse2, 1, 0, 0 +AVX_INSTR cmpnltss, sse, 1, 0, 0 +AVX_INSTR cmpordpd, sse2 1, 0, 1 +AVX_INSTR cmpordps, sse 1, 0, 1 +AVX_INSTR cmpordsd, sse2 1, 0, 0 +AVX_INSTR cmpordss, sse 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 -AVX_INSTR comisd, sse2 -AVX_INSTR comiss, sse -AVX_INSTR cvtdq2pd, sse2 -AVX_INSTR cvtdq2ps, sse2 -AVX_INSTR cvtpd2dq, sse2 -AVX_INSTR cvtpd2ps, sse2 -AVX_INSTR cvtps2dq, sse2 -AVX_INSTR cvtps2pd, sse2 -AVX_INSTR cvtsd2si, sse2 -AVX_INSTR cvtsd2ss, sse2 -AVX_INSTR cvtsi2sd, sse2 -AVX_INSTR cvtsi2ss, sse -AVX_INSTR cvtss2sd, sse2 -AVX_INSTR cvtss2si, sse -AVX_INSTR cvttpd2dq, sse2 -AVX_INSTR cvttps2dq, sse2 -AVX_INSTR cvttsd2si, sse2 -AVX_INSTR cvttss2si, sse +AVX_INSTR cmpunordpd, sse2, 1, 0, 1 +AVX_INSTR cmpunordps, sse, 1, 0, 1 +AVX_INSTR cmpunordsd, sse2, 1, 0, 0 +AVX_INSTR cmpunordss, sse, 1, 0, 0 +AVX_INSTR comisd, sse2, 1 +AVX_INSTR comiss, sse, 1 +AVX_INSTR cvtdq2pd, sse2, 1 +AVX_INSTR cvtdq2ps, sse2, 1 +AVX_INSTR cvtpd2dq, sse2, 1 +AVX_INSTR cvtpd2ps, sse2, 1 +AVX_INSTR cvtps2dq, sse2, 1 +AVX_INSTR cvtps2pd, sse2, 1 +AVX_INSTR cvtsd2si, sse2, 1 +AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 +AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 +AVX_INSTR cvtsi2ss, sse, 1, 0, 0 +AVX_INSTR cvtss2sd, sse2, 1, 0, 0 +AVX_INSTR cvtss2si, sse, 1 +AVX_INSTR cvttpd2dq, sse2, 1 +AVX_INSTR cvttps2dq, sse2, 1 +AVX_INSTR cvttsd2si, sse2, 1 +AVX_INSTR cvttss2si, sse, 1 AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 -AVX_INSTR extractps, sse4 +AVX_INSTR extractps, sse4, 1 +AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 +AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 -AVX_INSTR ldmxcsr, sse +AVX_INSTR ldmxcsr, sse, 1 AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 -AVX_INSTR maxsd, sse2, 1, 0, 1 -AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 0 +AVX_INSTR maxss, sse, 1, 0, 0 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 -AVX_INSTR minsd, sse2, 1, 0, 1 -AVX_INSTR minss, sse, 1, 0, 1 -AVX_INSTR movapd, sse2 -AVX_INSTR movaps, sse +AVX_INSTR minsd, sse2, 1, 0, 0 +AVX_INSTR minss, sse, 1, 0, 0 +AVX_INSTR movapd, sse2, 1 +AVX_INSTR movaps, sse, 1 AVX_INSTR movd, mmx -AVX_INSTR movddup, sse3 +AVX_INSTR movddup, sse3, 1 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 @@ -1386,24 +1586,24 @@ AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 -AVX_INSTR movmskpd, sse2 -AVX_INSTR movmskps, sse +AVX_INSTR movmskpd, sse2, 1 +AVX_INSTR movmskps, sse, 1 AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 -AVX_INSTR movntpd, sse2 -AVX_INSTR movntps, sse +AVX_INSTR movntpd, sse2, 1 +AVX_INSTR movntps, sse, 1 AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 -AVX_INSTR movshdup, sse3 -AVX_INSTR movsldup, sse3 +AVX_INSTR movshdup, sse3, 1 +AVX_INSTR movsldup, sse3, 1 AVX_INSTR movss, sse, 1, 0, 0 -AVX_INSTR movupd, sse2 -AVX_INSTR movups, sse -AVX_INSTR mpsadbw, sse4 +AVX_INSTR movupd, sse2, 1 +AVX_INSTR movups, sse, 1 +AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 -AVX_INSTR mulsd, sse2, 1, 0, 1 -AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 0 +AVX_INSTR mulss, sse, 1, 0, 0 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 @@ -1421,14 +1621,18 @@ AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 -AVX_INSTR palignr, ssse3 +AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 -AVX_INSTR pblendvb, sse4, 0, 0, 0 -AVX_INSTR pblendw, sse4 -AVX_INSTR pclmulqdq +AVX_INSTR pblendvb, sse4 ; can't be emulated +AVX_INSTR pblendw, sse4, 0, 1, 0 +AVX_INSTR pclmulqdq, fnord, 0, 1, 0 +AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 +AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpistri, sse42 @@ -1452,10 +1656,10 @@ AVX_INSTR phminposuw, sse4 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 -AVX_INSTR pinsrb, sse4 -AVX_INSTR pinsrd, sse4 -AVX_INSTR pinsrq, sse4 -AVX_INSTR pinsrw, mmx2 +AVX_INSTR pinsrb, sse4, 0, 1, 0 +AVX_INSTR pinsrd, sse4, 0, 1, 0 +AVX_INSTR pinsrq, sse4, 0, 1, 0 +AVX_INSTR pinsrw, mmx2, 0, 1, 0 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaxsb, sse4, 0, 0, 1 @@ -1527,27 +1731,27 @@ AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 -AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, sse, 1, 0, 0 -AVX_INSTR roundpd, sse4 -AVX_INSTR roundps, sse4 -AVX_INSTR roundsd, sse4 -AVX_INSTR roundss, sse4 -AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4, 1 +AVX_INSTR roundps, sse4, 1 +AVX_INSTR roundsd, sse4, 1, 1, 0 +AVX_INSTR roundss, sse4, 1, 1, 0 +AVX_INSTR rsqrtps, sse, 1 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 -AVX_INSTR sqrtpd, sse2, 1, 0, 0 -AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtpd, sse2, 1 +AVX_INSTR sqrtps, sse, 1 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 -AVX_INSTR stmxcsr, sse +AVX_INSTR stmxcsr, sse, 1 AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 -AVX_INSTR ucomisd, sse2 -AVX_INSTR ucomiss, sse +AVX_INSTR ucomisd, sse2, 1 +AVX_INSTR ucomiss, sse, 1 AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 @@ -1560,6 +1764,38 @@ AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 AVX_INSTR pfmul, 3dnow, 1, 0, 1 +;%1 == instruction +;%2 == minimal instruction set +%macro GPR_INSTR 2 + %macro %1 2-5 fnord, %1, %2 + %ifdef cpuname + %if notcpuflag(%5) + %error use of ``%4'' %5 instruction in cpuname function: current_function + %endif + %endif + %ifidn %3, fnord + %4 %1, %2 + %else + %4 %1, %2, %3 + %endif + %endmacro +%endmacro + +GPR_INSTR andn, bmi1 +GPR_INSTR bextr, bmi1 +GPR_INSTR blsi, bmi1 +GPR_INSTR blsr, bmi1 +GPR_INSTR blsmsk, bmi1 +GPR_INSTR bzhi, bmi2 +GPR_INSTR mulx, bmi2 +GPR_INSTR pdep, bmi2 +GPR_INSTR pext, bmi2 +GPR_INSTR popcnt, sse42 +GPR_INSTR rorx, bmi2 +GPR_INSTR sarx, bmi2 +GPR_INSTR shlx, bmi2 +GPR_INSTR shrx, bmi2 + ; base-4 constants for shuffles %assign i 0 %rep 256 @@ -1610,7 +1846,7 @@ FMA_INSTR pmadcswd, pmaddwd, paddd v%5%6 %1, %2, %3, %4 %elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. - %ifid %3 + %ifnum sizeof%3 v%{5}213%6 %2, %3, %4 %else v%{5}132%6 %2, %4, %3 @@ -1635,15 +1871,53 @@ FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) -%ifdef __YASM_VER__ - %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 %endif - %endmacro - %endif -%endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %ifnum regnumof%3 + %if regnumof%3 >= 16 || sizeof%3 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 diff --git a/media/libvpx/libvpx/tools.mk b/media/libvpx/libvpx/tools.mk index 3c660b1dfd..79bb0cb8d6 100644 --- a/media/libvpx/libvpx/tools.mk +++ b/media/libvpx/libvpx/tools.mk @@ -10,7 +10,12 @@ # List of tools to build. TOOLS-yes += tiny_ssim.c -tiny_ssim.SRCS += vpx/vpx_integer.h +tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ + vpx/vpx_codec.h vpx/src/vpx_image.c +tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h +tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files @@ -23,7 +28,11 @@ tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files # Expand list of selected tools to build (as specified above) TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) +CFLAGS += -I../include +ifneq ($(CONFIG_CODEC_SRCS), yes) + CFLAGS += -I../include/vpx +endif # Expand all tools sources into a variable containing all sources # for that tools (not just them main one specified in TOOLS) @@ -39,15 +48,11 @@ DIST-SRCS-yes += $(ALL_SRCS) OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) - # Instantiate linker template for all tools. $(foreach bin,$(BINS-yes),\ $(eval $(bin):)\ $(eval $(call linker_template,$(bin),\ - $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ - -lm\ - ))) - + $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) -lm))) # The following pairs define a mapping of locations in the distribution # tree to locations in the source/build trees. @@ -74,6 +79,7 @@ $(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) --ver=$$(CONFIG_VS_VERSION)\ --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ --src-path-bare="$(SRC_PATH_BARE)" \ + --as=$$(AS) \ $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ @@ -85,6 +91,13 @@ INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ $(foreach proj,$(call enabled,PROJECTS),\ $(eval $(call vcproj_template,$(proj)))) +# Generate a list of all enabled sources, in particular for exporting to gyp +# based build systems. +tiny_ssim_srcs.txt: + @echo " [CREATE] $@" + @echo $(tiny_ssim.SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ +CLEAN-OBJS += tiny_ssim_srcs.txt + # # Documentation Rules # diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py new file mode 100644 index 0000000000..5ff9e98932 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Anandan.py @@ -0,0 +1,193 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# coding: utf-8 +import numpy as np +import numpy.linalg as LA +from scipy.ndimage.filters import gaussian_filter +from scipy.sparse import csc_matrix +from scipy.sparse.linalg import inv +from MotionEST import MotionEST +"""Anandan Model""" + + +class Anandan(MotionEST): + """ + constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + beta: smooth constrain weight + k1,k2,k3: confidence coefficients + max_iter: maximum number of iterations + """ + + def __init__(self, cur_f, ref_f, blk_sz, beta, k1, k2, k3, max_iter=100): + super(Anandan, self).__init__(cur_f, ref_f, blk_sz) + self.levels = int(np.log2(blk_sz)) + self.intensity_hierarchy() + self.c_maxs = [] + self.c_mins = [] + self.e_maxs = [] + self.e_mins = [] + for l in xrange(self.levels + 1): + c_max, c_min, e_max, e_min = self.get_curvature(self.cur_Is[l]) + self.c_maxs.append(c_max) + self.c_mins.append(c_min) + self.e_maxs.append(e_max) + self.e_mins.append(e_min) + self.beta = beta + self.k1, self.k2, self.k3 = k1, k2, k3 + self.max_iter = max_iter + + """ + build intensity hierarchy + """ + + def intensity_hierarchy(self): + level = 0 + self.cur_Is = [] + self.ref_Is = [] + #build each level itensity by using gaussian filters + while level <= self.levels: + cur_I = gaussian_filter(self.cur_yuv[:, :, 0], sigma=(2**level) * 0.56) + ref_I = gaussian_filter(self.ref_yuv[:, :, 0], sigma=(2**level) * 0.56) + self.ref_Is.append(ref_I) + self.cur_Is.append(cur_I) + level += 1 + + """ + get curvature of each block + """ + + def get_curvature(self, I): + c_max = np.zeros((self.num_row, self.num_col)) + c_min = np.zeros((self.num_row, self.num_col)) + e_max = np.zeros((self.num_row, self.num_col, 2)) + e_min = np.zeros((self.num_row, self.num_col, 2)) + for r in xrange(self.num_row): + for c in xrange(self.num_col): + h11, h12, h21, h22 = 0, 0, 0, 0 + for i in xrange(r * self.blk_sz, r * self.blk_sz + self.blk_sz): + for j in xrange(c * self.blk_sz, c * self.blk_sz + self.blk_sz): + if 0 <= i < self.height - 1 and 0 <= j < self.width - 1: + Ix = I[i][j + 1] - I[i][j] + Iy = I[i + 1][j] - I[i][j] + h11 += Iy * Iy + h12 += Ix * Iy + h21 += Ix * Iy + h22 += Ix * Ix + U, S, _ = LA.svd(np.array([[h11, h12], [h21, h22]])) + c_max[r, c], c_min[r, c] = S[0], S[1] + e_max[r, c] = U[:, 0] + e_min[r, c] = U[:, 1] + return c_max, c_min, e_max, e_min + + """ + get ssd of motion vector: + cur_I: current intensity + ref_I: reference intensity + center: current position + mv: motion vector + """ + + def get_ssd(self, cur_I, ref_I, center, mv): + ssd = 0 + for r in xrange(int(center[0]), int(center[0]) + self.blk_sz): + for c in xrange(int(center[1]), int(center[1]) + self.blk_sz): + if 0 <= r < self.height and 0 <= c < self.width: + tr, tc = r + int(mv[0]), c + int(mv[1]) + if 0 <= tr < self.height and 0 <= tc < self.width: + ssd += (ref_I[tr, tc] - cur_I[r, c])**2 + else: + ssd += cur_I[r, c]**2 + return ssd + + """ + get region match of level l + l: current level + last_mvs: matchine results of last level + radius: movenment radius + """ + + def region_match(self, l, last_mvs, radius): + mvs = np.zeros((self.num_row, self.num_col, 2)) + min_ssds = np.zeros((self.num_row, self.num_col)) + for r in xrange(self.num_row): + for c in xrange(self.num_col): + center = np.array([r * self.blk_sz, c * self.blk_sz]) + #use overlap hierarchy policy + init_mvs = [] + if last_mvs is None: + init_mvs = [np.array([0, 0])] + else: + for i, j in {(r, c), (r, c + 1), (r + 1, c), (r + 1, c + 1)}: + if 0 <= i < last_mvs.shape[0] and 0 <= j < last_mvs.shape[1]: + init_mvs.append(last_mvs[i, j]) + #use last matching results as the start position as current level + min_ssd = None + min_mv = None + for init_mv in init_mvs: + for i in xrange(-2, 3): + for j in xrange(-2, 3): + mv = init_mv + np.array([i, j]) * radius + ssd = self.get_ssd(self.cur_Is[l], self.ref_Is[l], center, mv) + if min_ssd is None or ssd < min_ssd: + min_ssd = ssd + min_mv = mv + min_ssds[r, c] = min_ssd + mvs[r, c] = min_mv + return mvs, min_ssds + + """ + smooth motion field based on neighbor constraint + uvs: current estimation + mvs: matching results + min_ssds: minimum ssd of matching results + l: current level + """ + + def smooth(self, uvs, mvs, min_ssds, l): + sm_uvs = np.zeros((self.num_row, self.num_col, 2)) + c_max = self.c_maxs[l] + c_min = self.c_mins[l] + e_max = self.e_maxs[l] + e_min = self.e_mins[l] + for r in xrange(self.num_row): + for c in xrange(self.num_col): + w_max = c_max[r, c] / ( + self.k1 + self.k2 * min_ssds[r, c] + self.k3 * c_max[r, c]) + w_min = c_min[r, c] / ( + self.k1 + self.k2 * min_ssds[r, c] + self.k3 * c_min[r, c]) + w = w_max * w_min / (w_max + w_min + 1e-6) + if w < 0: + w = 0 + avg_uv = np.array([0.0, 0.0]) + for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + avg_uv += 0.25 * uvs[i, j] + sm_uvs[r, c] = (w * w * mvs[r, c] + self.beta * avg_uv) / ( + self.beta + w * w) + return sm_uvs + + """ + motion field estimation + """ + + def motion_field_estimation(self): + last_mvs = None + for l in xrange(self.levels, -1, -1): + mvs, min_ssds = self.region_match(l, last_mvs, 2**l) + uvs = np.zeros(mvs.shape) + for _ in xrange(self.max_iter): + uvs = self.smooth(uvs, mvs, min_ssds, l) + last_mvs = uvs + for r in xrange(self.num_row): + for c in xrange(self.num_col): + self.mf[r, c] = uvs[r, c] diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py new file mode 100644 index 0000000000..d763de8562 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py @@ -0,0 +1,259 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# coding: utf-8 +import numpy as np +import numpy.linalg as LA +from Util import MSE +from MotionEST import MotionEST +"""Exhaust Search:""" + + +class Exhaust(MotionEST): + """ + Constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + wnd_size: search window size + metric: metric to compare the blocks distrotion + """ + + def __init__(self, cur_f, ref_f, blk_size, wnd_size, metric=MSE): + self.name = 'exhaust' + self.wnd_sz = wnd_size + self.metric = metric + super(Exhaust, self).__init__(cur_f, ref_f, blk_size) + + """ + search method: + cur_r: start row + cur_c: start column + """ + + def search(self, cur_r, cur_c): + min_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric) + cur_x = cur_c * self.blk_sz + cur_y = cur_r * self.blk_sz + ref_x = cur_x + ref_y = cur_y + #search all validate positions and select the one with minimum distortion + for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz): + for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz): + if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz: + loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x], + self.metric) + if loss < min_loss: + min_loss = loss + ref_x = x + ref_y = y + return ref_x, ref_y + + def motion_field_estimation(self): + for i in xrange(self.num_row): + for j in xrange(self.num_col): + ref_x, ref_y = self.search(i, j) + self.mf[i, j] = np.array( + [ref_y - i * self.blk_sz, ref_x - j * self.blk_sz]) + + +"""Exhaust with Neighbor Constraint""" + + +class ExhaustNeighbor(MotionEST): + """ + Constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + wnd_size: search window size + beta: neigbor loss weight + metric: metric to compare the blocks distrotion + """ + + def __init__(self, cur_f, ref_f, blk_size, wnd_size, beta, metric=MSE): + self.name = 'exhaust + neighbor' + self.wnd_sz = wnd_size + self.beta = beta + self.metric = metric + super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size) + self.assign = np.zeros((self.num_row, self.num_col), dtype=bool) + + """ + estimate neighbor loss: + cur_r: current row + cur_c: current column + mv: current motion vector + """ + + def neighborLoss(self, cur_r, cur_c, mv): + loss = 0 + #accumulate difference between current block's motion vector with neighbors' + for i, j in {(-1, 0), (1, 0), (0, 1), (0, -1)}: + nb_r = cur_r + i + nb_c = cur_c + j + if 0 <= nb_r < self.num_row and 0 <= nb_c < self.num_col and self.assign[ + nb_r, nb_c]: + loss += LA.norm(mv - self.mf[nb_r, nb_c]) + return loss + + """ + search method: + cur_r: start row + cur_c: start column + """ + + def search(self, cur_r, cur_c): + dist_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric) + nb_loss = self.neighborLoss(cur_r, cur_c, np.array([0, 0])) + min_loss = dist_loss + self.beta * nb_loss + cur_x = cur_c * self.blk_sz + cur_y = cur_r * self.blk_sz + ref_x = cur_x + ref_y = cur_y + #search all validate positions and select the one with minimum distortion + # as well as weighted neighbor loss + for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz): + for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz): + if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz: + dist_loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x], + self.metric) + nb_loss = self.neighborLoss(cur_r, cur_c, [y - cur_y, x - cur_x]) + loss = dist_loss + self.beta * nb_loss + if loss < min_loss: + min_loss = loss + ref_x = x + ref_y = y + return ref_x, ref_y + + def motion_field_estimation(self): + for i in xrange(self.num_row): + for j in xrange(self.num_col): + ref_x, ref_y = self.search(i, j) + self.mf[i, j] = np.array( + [ref_y - i * self.blk_sz, ref_x - j * self.blk_sz]) + self.assign[i, j] = True + + +"""Exhaust with Neighbor Constraint and Feature Score""" + + +class ExhaustNeighborFeatureScore(MotionEST): + """ + Constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + wnd_size: search window size + beta: neigbor loss weight + max_iter: maximum number of iterations + metric: metric to compare the blocks distrotion + """ + + def __init__(self, + cur_f, + ref_f, + blk_size, + wnd_size, + beta=1, + max_iter=100, + metric=MSE): + self.name = 'exhaust + neighbor+feature score' + self.wnd_sz = wnd_size + self.beta = beta + self.metric = metric + self.max_iter = max_iter + super(ExhaustNeighborFeatureScore, self).__init__(cur_f, ref_f, blk_size) + self.fs = self.getFeatureScore() + + """ + get feature score of each block + """ + + def getFeatureScore(self): + fs = np.zeros((self.num_row, self.num_col)) + for r in xrange(self.num_row): + for c in xrange(self.num_col): + IxIx = 0 + IyIy = 0 + IxIy = 0 + #get ssd surface + for x in xrange(self.blk_sz - 1): + for y in xrange(self.blk_sz - 1): + ox = c * self.blk_sz + x + oy = r * self.blk_sz + y + Ix = self.cur_yuv[oy, ox + 1, 0] - self.cur_yuv[oy, ox, 0] + Iy = self.cur_yuv[oy + 1, ox, 0] - self.cur_yuv[oy, ox, 0] + IxIx += Ix * Ix + IyIy += Iy * Iy + IxIy += Ix * Iy + #get maximum and minimum eigenvalues + lambda_max = 0.5 * ((IxIx + IyIy) + np.sqrt(4 * IxIy * IxIy + + (IxIx - IyIy)**2)) + lambda_min = 0.5 * ((IxIx + IyIy) - np.sqrt(4 * IxIy * IxIy + + (IxIx - IyIy)**2)) + fs[r, c] = lambda_max * lambda_min / (1e-6 + lambda_max + lambda_min) + if fs[r, c] < 0: + fs[r, c] = 0 + return fs + + """ + do exhaust search + """ + + def search(self, cur_r, cur_c): + min_loss = self.block_dist(cur_r, cur_c, [0, 0], self.metric) + cur_x = cur_c * self.blk_sz + cur_y = cur_r * self.blk_sz + ref_x = cur_x + ref_y = cur_y + #search all validate positions and select the one with minimum distortion + for y in xrange(cur_y - self.wnd_sz, cur_y + self.wnd_sz): + for x in xrange(cur_x - self.wnd_sz, cur_x + self.wnd_sz): + if 0 <= x < self.width - self.blk_sz and 0 <= y < self.height - self.blk_sz: + loss = self.block_dist(cur_r, cur_c, [y - cur_y, x - cur_x], + self.metric) + if loss < min_loss: + min_loss = loss + ref_x = x + ref_y = y + return ref_x, ref_y + + """ + add smooth constraint + """ + + def smooth(self, uvs, mvs): + sm_uvs = np.zeros(uvs.shape) + for r in xrange(self.num_row): + for c in xrange(self.num_col): + avg_uv = np.array([0.0, 0.0]) + for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + avg_uv += uvs[i, j] / 6.0 + for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1), + (r + 1, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + avg_uv += uvs[i, j] / 12.0 + sm_uvs[r, c] = (self.fs[r, c] * mvs[r, c] + self.beta * avg_uv) / ( + self.beta + self.fs[r, c]) + return sm_uvs + + def motion_field_estimation(self): + #get matching results + mvs = np.zeros(self.mf.shape) + for r in xrange(self.num_row): + for c in xrange(self.num_col): + ref_x, ref_y = self.search(r, c) + mvs[r, c] = np.array([ref_y - r * self.blk_sz, ref_x - c * self.blk_sz]) + #add smoothness constraint + uvs = np.zeros(self.mf.shape) + for _ in xrange(self.max_iter): + uvs = self.smooth(uvs, mvs) + self.mf = uvs diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py new file mode 100644 index 0000000000..37305898a7 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py @@ -0,0 +1,48 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +#coding : utf - 8 +import numpy as np +import numpy.linalg as LA +from MotionEST import MotionEST +"""Ground Truth: + + Load in ground truth motion field and mask +""" + + +class GroundTruth(MotionEST): + """constructor: + + cur_f:current + frame ref_f:reference + frame blk_sz:block size + gt_path:ground truth motion field file path + """ + + def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None): + self.name = 'ground truth' + super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz) + self.mask = np.zeros((self.num_row, self.num_col), dtype=bool) + if gt_path: + with open(gt_path) as gt_file: + lines = gt_file.readlines() + for i in xrange(len(lines)): + info = lines[i].split(';') + for j in xrange(len(info)): + x, y = info[j].split(',') + #-, - stands for nothing + if x == '-' or y == '-': + self.mask[i, -j - 1] = True + continue + #the order of original file is flipped on the x axis + self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int) + else: + self.mf = mf + self.mask = mask diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py new file mode 100644 index 0000000000..976bd4a178 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/HornSchunck.py @@ -0,0 +1,212 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# coding: utf-8 +import numpy as np +import numpy.linalg as LA +from scipy.ndimage.filters import gaussian_filter +from scipy.sparse import csc_matrix +from scipy.sparse.linalg import inv +from MotionEST import MotionEST +"""Horn & Schunck Model""" + + +class HornSchunck(MotionEST): + """ + constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + alpha: smooth constrain weight + sigma: gaussian blur parameter + """ + + def __init__(self, cur_f, ref_f, blk_sz, alpha, sigma, max_iter=100): + super(HornSchunck, self).__init__(cur_f, ref_f, blk_sz) + self.cur_I, self.ref_I = self.getIntensity() + #perform gaussian blur to smooth the intensity + self.cur_I = gaussian_filter(self.cur_I, sigma=sigma) + self.ref_I = gaussian_filter(self.ref_I, sigma=sigma) + self.alpha = alpha + self.max_iter = max_iter + self.Ix, self.Iy, self.It = self.intensityDiff() + + """ + Build Frame Intensity + """ + + def getIntensity(self): + cur_I = np.zeros((self.num_row, self.num_col)) + ref_I = np.zeros((self.num_row, self.num_col)) + #use average intensity as block's intensity + for i in xrange(self.num_row): + for j in xrange(self.num_col): + r = i * self.blk_sz + c = j * self.blk_sz + cur_I[i, j] = np.mean(self.cur_yuv[r:r + self.blk_sz, c:c + self.blk_sz, + 0]) + ref_I[i, j] = np.mean(self.ref_yuv[r:r + self.blk_sz, c:c + self.blk_sz, + 0]) + return cur_I, ref_I + + """ + Get First Order Derivative + """ + + def intensityDiff(self): + Ix = np.zeros((self.num_row, self.num_col)) + Iy = np.zeros((self.num_row, self.num_col)) + It = np.zeros((self.num_row, self.num_col)) + sz = self.blk_sz + for i in xrange(self.num_row - 1): + for j in xrange(self.num_col - 1): + """ + Ix: + (i ,j) <--- (i ,j+1) + (i+1,j) <--- (i+1,j+1) + """ + count = 0 + for r, c in {(i, j + 1), (i + 1, j + 1)}: + if 0 <= r < self.num_row and 0 < c < self.num_col: + Ix[i, j] += ( + self.cur_I[r, c] - self.cur_I[r, c - 1] + self.ref_I[r, c] - + self.ref_I[r, c - 1]) + count += 2 + Ix[i, j] /= count + """ + Iy: + (i ,j) (i ,j+1) + ^ ^ + | | + (i+1,j) (i+1,j+1) + """ + count = 0 + for r, c in {(i + 1, j), (i + 1, j + 1)}: + if 0 < r < self.num_row and 0 <= c < self.num_col: + Iy[i, j] += ( + self.cur_I[r, c] - self.cur_I[r - 1, c] + self.ref_I[r, c] - + self.ref_I[r - 1, c]) + count += 2 + Iy[i, j] /= count + count = 0 + #It: + for r in xrange(i, i + 2): + for c in xrange(j, j + 2): + if 0 <= r < self.num_row and 0 <= c < self.num_col: + It[i, j] += (self.ref_I[r, c] - self.cur_I[r, c]) + count += 1 + It[i, j] /= count + return Ix, Iy, It + + """ + Get weighted average of neighbor motion vectors + for evaluation of laplacian + """ + + def averageMV(self): + avg = np.zeros((self.num_row, self.num_col, 2)) + """ + 1/12 --- 1/6 --- 1/12 + | | | + 1/6 --- -1/8 --- 1/6 + | | | + 1/12 --- 1/6 --- 1/12 + """ + for i in xrange(self.num_row): + for j in xrange(self.num_col): + for r, c in {(-1, 0), (1, 0), (0, -1), (0, 1)}: + if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col: + avg[i, j] += self.mf[i + r, j + c] / 6.0 + for r, c in {(-1, -1), (-1, 1), (1, -1), (1, 1)}: + if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col: + avg[i, j] += self.mf[i + r, j + c] / 12.0 + return avg + + def motion_field_estimation(self): + count = 0 + """ + u_{n+1} = ~u_n - Ix(Ix.~u_n+Iy.~v+It)/(IxIx+IyIy+alpha^2) + v_{n+1} = ~v_n - Iy(Ix.~u_n+Iy.~v+It)/(IxIx+IyIy+alpha^2) + """ + denom = self.alpha**2 + np.power(self.Ix, 2) + np.power(self.Iy, 2) + while count < self.max_iter: + avg = self.averageMV() + self.mf[:, :, 1] = avg[:, :, 1] - self.Ix * ( + self.Ix * avg[:, :, 1] + self.Iy * avg[:, :, 0] + self.It) / denom + self.mf[:, :, 0] = avg[:, :, 0] - self.Iy * ( + self.Ix * avg[:, :, 1] + self.Iy * avg[:, :, 0] + self.It) / denom + count += 1 + self.mf *= self.blk_sz + + def motion_field_estimation_mat(self): + row_idx = [] + col_idx = [] + data = [] + + N = 2 * self.num_row * self.num_col + b = np.zeros((N, 1)) + for i in xrange(self.num_row): + for j in xrange(self.num_col): + """(IxIx+alpha^2)u+IxIy.v-alpha^2~u IxIy.u+(IyIy+alpha^2)v-alpha^2~v""" + u_idx = i * 2 * self.num_col + 2 * j + v_idx = u_idx + 1 + b[u_idx, 0] = -self.Ix[i, j] * self.It[i, j] + b[v_idx, 0] = -self.Iy[i, j] * self.It[i, j] + #u: (IxIx+alpha^2)u + row_idx.append(u_idx) + col_idx.append(u_idx) + data.append(self.Ix[i, j] * self.Ix[i, j] + self.alpha**2) + #IxIy.v + row_idx.append(u_idx) + col_idx.append(v_idx) + data.append(self.Ix[i, j] * self.Iy[i, j]) + + #v: IxIy.u + row_idx.append(v_idx) + col_idx.append(u_idx) + data.append(self.Ix[i, j] * self.Iy[i, j]) + #(IyIy+alpha^2)v + row_idx.append(v_idx) + col_idx.append(v_idx) + data.append(self.Iy[i, j] * self.Iy[i, j] + self.alpha**2) + + #-alpha^2~u + #-alpha^2~v + for r, c in {(-1, 0), (1, 0), (0, -1), (0, 1)}: + if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col: + u_nb = (i + r) * 2 * self.num_col + 2 * (j + c) + v_nb = u_nb + 1 + + row_idx.append(u_idx) + col_idx.append(u_nb) + data.append(-1 * self.alpha**2 / 6.0) + + row_idx.append(v_idx) + col_idx.append(v_nb) + data.append(-1 * self.alpha**2 / 6.0) + for r, c in {(-1, -1), (-1, 1), (1, -1), (1, 1)}: + if 0 <= i + r < self.num_row and 0 <= j + c < self.num_col: + u_nb = (i + r) * 2 * self.num_col + 2 * (j + c) + v_nb = u_nb + 1 + + row_idx.append(u_idx) + col_idx.append(u_nb) + data.append(-1 * self.alpha**2 / 12.0) + + row_idx.append(v_idx) + col_idx.append(v_nb) + data.append(-1 * self.alpha**2 / 12.0) + M = csc_matrix((data, (row_idx, col_idx)), shape=(N, N)) + M_inv = inv(M) + uv = M_inv.dot(b) + + for i in xrange(self.num_row): + for j in xrange(self.num_col): + self.mf[i, j, 0] = uv[i * 2 * self.num_col + 2 * j + 1, 0] * self.blk_sz + self.mf[i, j, 1] = uv[i * 2 * self.num_col + 2 * j, 0] * self.blk_sz diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py new file mode 100644 index 0000000000..fc393818d9 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py @@ -0,0 +1,117 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +#coding : utf - 8 +import numpy as np +import numpy.linalg as LA +import matplotlib.pyplot as plt +from Util import drawMF, MSE +"""The Base Class of Estimators""" + + +class MotionEST(object): + """ + constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + """ + + def __init__(self, cur_f, ref_f, blk_sz): + self.cur_f = cur_f + self.ref_f = ref_f + self.blk_sz = blk_sz + #convert RGB to YUV + self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int) + self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int) + #frame size + self.width = self.cur_f.size[0] + self.height = self.cur_f.size[1] + #motion field size + self.num_row = self.height // self.blk_sz + self.num_col = self.width // self.blk_sz + #initialize motion field + self.mf = np.zeros((self.num_row, self.num_col, 2)) + + """estimation function Override by child classes""" + + def motion_field_estimation(self): + pass + + """ + distortion of a block: + cur_r: current row + cur_c: current column + mv: motion vector + metric: distortion metric + """ + + def block_dist(self, cur_r, cur_c, mv, metric=MSE): + cur_x = cur_c * self.blk_sz + cur_y = cur_r * self.blk_sz + h = min(self.blk_sz, self.height - cur_y) + w = min(self.blk_sz, self.width - cur_x) + cur_blk = self.cur_yuv[cur_y:cur_y + h, cur_x:cur_x + w, :] + ref_x = int(cur_x + mv[1]) + ref_y = int(cur_y + mv[0]) + if 0 <= ref_x < self.width - w and 0 <= ref_y < self.height - h: + ref_blk = self.ref_yuv[ref_y:ref_y + h, ref_x:ref_x + w, :] + else: + ref_blk = np.zeros((h, w, 3)) + return metric(cur_blk, ref_blk) + + """ + distortion of motion field + """ + + def distortion(self, mask=None, metric=MSE): + loss = 0 + count = 0 + for i in xrange(self.num_row): + for j in xrange(self.num_col): + if mask is not None and mask[i, j]: + continue + loss += self.block_dist(i, j, self.mf[i, j], metric) + count += 1 + return loss / count + + """evaluation compare the difference with ground truth""" + + def motion_field_evaluation(self, ground_truth): + loss = 0 + count = 0 + gt = ground_truth.mf + mask = ground_truth.mask + for i in xrange(self.num_row): + for j in xrange(self.num_col): + if mask is not None and mask[i][j]: + continue + loss += LA.norm(gt[i, j] - self.mf[i, j]) + count += 1 + return loss / count + + """render the motion field""" + + def show(self, ground_truth=None, size=10): + cur_mf = drawMF(self.cur_f, self.blk_sz, self.mf) + if ground_truth is None: + n_row = 1 + else: + gt_mf = drawMF(self.cur_f, self.blk_sz, ground_truth) + n_row = 2 + plt.figure(figsize=(n_row * size, size * self.height / self.width)) + plt.subplot(1, n_row, 1) + plt.imshow(cur_mf) + plt.title('Estimated Motion Field') + if ground_truth is not None: + plt.subplot(1, n_row, 2) + plt.imshow(gt_mf) + plt.title('Ground Truth') + plt.tight_layout() + plt.show() diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py new file mode 100644 index 0000000000..2dc6771ee5 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/SearchSmooth.py @@ -0,0 +1,221 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# coding: utf-8 +import numpy as np +import numpy.linalg as LA +from Util import MSE +from MotionEST import MotionEST +"""Search & Smooth Model with Adapt Weights""" + + +class SearchSmoothAdapt(MotionEST): + """ + Constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + wnd_size: search window size + beta: neigbor loss weight + max_iter: maximum number of iterations + metric: metric to compare the blocks distrotion + """ + + def __init__(self, cur_f, ref_f, blk_size, search, max_iter=100): + self.search = search + self.max_iter = max_iter + super(SearchSmoothAdapt, self).__init__(cur_f, ref_f, blk_size) + + """ + get local diffiencial of refernce + """ + + def getRefLocalDiff(self, mvs): + m, n = self.num_row, self.num_col + localDiff = [[] for _ in xrange(m)] + blk_sz = self.blk_sz + for r in xrange(m): + for c in xrange(n): + I_row = 0 + I_col = 0 + #get ssd surface + count = 0 + center = self.cur_yuv[r * blk_sz:(r + 1) * blk_sz, + c * blk_sz:(c + 1) * blk_sz, 0] + ty = np.clip(r * blk_sz + int(mvs[r, c, 0]), 0, self.height - blk_sz) + tx = np.clip(c * blk_sz + int(mvs[r, c, 1]), 0, self.width - blk_sz) + target = self.ref_yuv[ty:ty + blk_sz, tx:tx + blk_sz, 0] + for y, x in {(ty - blk_sz, tx), (ty + blk_sz, tx)}: + if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz: + nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0] + I_row += np.sum(np.abs(nb - center)) - np.sum( + np.abs(target - center)) + count += 1 + I_row //= (count * blk_sz * blk_sz) + count = 0 + for y, x in {(ty, tx - blk_sz), (ty, tx + blk_sz)}: + if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz: + nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0] + I_col += np.sum(np.abs(nb - center)) - np.sum( + np.abs(target - center)) + count += 1 + I_col //= (count * blk_sz * blk_sz) + localDiff[r].append( + np.array([[I_row * I_row, I_row * I_col], + [I_col * I_row, I_col * I_col]])) + return localDiff + + """ + add smooth constraint + """ + + def smooth(self, uvs, mvs): + sm_uvs = np.zeros(uvs.shape) + blk_sz = self.blk_sz + for r in xrange(self.num_row): + for c in xrange(self.num_col): + nb_uv = np.array([0.0, 0.0]) + for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + nb_uv += uvs[i, j] / 6.0 + else: + nb_uv += uvs[r, c] / 6.0 + for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1), + (r + 1, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + nb_uv += uvs[i, j] / 12.0 + else: + nb_uv += uvs[r, c] / 12.0 + ssd_nb = self.block_dist(r, c, self.blk_sz * nb_uv) + mv = mvs[r, c] + ssd_mv = self.block_dist(r, c, mv) + alpha = (ssd_nb - ssd_mv) / (ssd_mv + 1e-6) + M = alpha * self.localDiff[r][c] + P = M + np.identity(2) + inv_P = LA.inv(P) + sm_uvs[r, c] = np.dot(inv_P, nb_uv) + np.dot( + np.matmul(inv_P, M), mv / blk_sz) + return sm_uvs + + def block_matching(self): + self.search.motion_field_estimation() + + def motion_field_estimation(self): + self.localDiff = self.getRefLocalDiff(self.search.mf) + #get matching results + mvs = self.search.mf + #add smoothness constraint + uvs = mvs / self.blk_sz + for _ in xrange(self.max_iter): + uvs = self.smooth(uvs, mvs) + self.mf = uvs * self.blk_sz + + +"""Search & Smooth Model with Fixed Weights""" + + +class SearchSmoothFix(MotionEST): + """ + Constructor: + cur_f: current frame + ref_f: reference frame + blk_sz: block size + wnd_size: search window size + beta: neigbor loss weight + max_iter: maximum number of iterations + metric: metric to compare the blocks distrotion + """ + + def __init__(self, cur_f, ref_f, blk_size, search, beta, max_iter=100): + self.search = search + self.max_iter = max_iter + self.beta = beta + super(SearchSmoothFix, self).__init__(cur_f, ref_f, blk_size) + + """ + get local diffiencial of refernce + """ + + def getRefLocalDiff(self, mvs): + m, n = self.num_row, self.num_col + localDiff = [[] for _ in xrange(m)] + blk_sz = self.blk_sz + for r in xrange(m): + for c in xrange(n): + I_row = 0 + I_col = 0 + #get ssd surface + count = 0 + center = self.cur_yuv[r * blk_sz:(r + 1) * blk_sz, + c * blk_sz:(c + 1) * blk_sz, 0] + ty = np.clip(r * blk_sz + int(mvs[r, c, 0]), 0, self.height - blk_sz) + tx = np.clip(c * blk_sz + int(mvs[r, c, 1]), 0, self.width - blk_sz) + target = self.ref_yuv[ty:ty + blk_sz, tx:tx + blk_sz, 0] + for y, x in {(ty - blk_sz, tx), (ty + blk_sz, tx)}: + if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz: + nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0] + I_row += np.sum(np.abs(nb - center)) - np.sum( + np.abs(target - center)) + count += 1 + I_row //= (count * blk_sz * blk_sz) + count = 0 + for y, x in {(ty, tx - blk_sz), (ty, tx + blk_sz)}: + if 0 <= y < self.height - blk_sz and 0 <= x < self.width - blk_sz: + nb = self.ref_yuv[y:y + blk_sz, x:x + blk_sz, 0] + I_col += np.sum(np.abs(nb - center)) - np.sum( + np.abs(target - center)) + count += 1 + I_col //= (count * blk_sz * blk_sz) + localDiff[r].append( + np.array([[I_row * I_row, I_row * I_col], + [I_col * I_row, I_col * I_col]])) + return localDiff + + """ + add smooth constraint + """ + + def smooth(self, uvs, mvs): + sm_uvs = np.zeros(uvs.shape) + blk_sz = self.blk_sz + for r in xrange(self.num_row): + for c in xrange(self.num_col): + nb_uv = np.array([0.0, 0.0]) + for i, j in {(r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + nb_uv += uvs[i, j] / 6.0 + else: + nb_uv += uvs[r, c] / 6.0 + for i, j in {(r - 1, c - 1), (r - 1, c + 1), (r + 1, c - 1), + (r + 1, c + 1)}: + if 0 <= i < self.num_row and 0 <= j < self.num_col: + nb_uv += uvs[i, j] / 12.0 + else: + nb_uv += uvs[r, c] / 12.0 + mv = mvs[r, c] / blk_sz + M = self.localDiff[r][c] + P = M + self.beta * np.identity(2) + inv_P = LA.inv(P) + sm_uvs[r, c] = np.dot(inv_P, self.beta * nb_uv) + np.dot( + np.matmul(inv_P, M), mv) + return sm_uvs + + def block_matching(self): + self.search.motion_field_estimation() + + def motion_field_estimation(self): + #get local structure + self.localDiff = self.getRefLocalDiff(self.search.mf) + #get matching results + mvs = self.search.mf + #add smoothness constraint + uvs = mvs / self.blk_sz + for _ in xrange(self.max_iter): + uvs = self.smooth(uvs, mvs) + self.mf = uvs * self.blk_sz diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py new file mode 100644 index 0000000000..c2416163be --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/MotionEST/Util.py @@ -0,0 +1,46 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# coding: utf-8 +import numpy as np +import numpy.linalg as LA +import matplotlib.pyplot as plt +from scipy.ndimage import filters +from PIL import Image, ImageDraw + + +def MSE(blk1, blk2): + return np.mean( + LA.norm( + np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2)) + + +def drawMF(img, blk_sz, mf): + img_rgba = img.convert('RGBA') + mf_layer = Image.new(mode='RGBA', size=img_rgba.size, color=(0, 0, 0, 0)) + draw = ImageDraw.Draw(mf_layer) + width = img_rgba.size[0] + height = img_rgba.size[1] + num_row = height // blk_sz + num_col = width // blk_sz + for i in xrange(num_row): + left = (0, i * blk_sz) + right = (width, i * blk_sz) + draw.line([left, right], fill=(0, 0, 255, 255)) + for j in xrange(num_col): + up = (j * blk_sz, 0) + down = (j * blk_sz, height) + draw.line([up, down], fill=(0, 0, 255, 255)) + for i in xrange(num_row): + for j in xrange(num_col): + center = (j * blk_sz + 0.5 * blk_sz, i * blk_sz + 0.5 * blk_sz) + """mf[i,j][0] is the row shift and mf[i,j][1] is the column shift In PIL coordinates, head[0] is x (column shift) and head[1] is y (row shift).""" + head = (center[0] + mf[i, j][1], center[1] + mf[i, j][0]) + draw.line([center, head], fill=(255, 0, 0, 255)) + return Image.alpha_composite(img_rgba, mf_layer) diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py b/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py new file mode 100644 index 0000000000..8028102f0c --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py @@ -0,0 +1,85 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +import argparse +from os import listdir, path +from PIL import Image +import sys + +parser = argparse.ArgumentParser() +parser.add_argument("--frame_path", default="../data/frame/", type=str) +parser.add_argument("--frame_rate", default="25:1", type=str) +parser.add_argument("--interlacing", default="Ip", type=str) +parser.add_argument("--pix_ratio", default="0:0", type=str) +parser.add_argument("--color_space", default="4:2:0", type=str) +parser.add_argument("--output", default="output.y4m", type=str) + + +def generate(args, frames): + if len(frames) == 0: + return + #sort the frames based on the frame index + frames = sorted(frames, key=lambda x: x[0]) + #convert the frames to YUV form + frames = [f.convert("YCbCr") for _, f in frames] + #write the header + header = "YUV4MPEG2 W%d H%d F%s %s A%s" % (frames[0].width, frames[0].height, + args.frame_rate, args.interlacing, + args.pix_ratio) + cs = args.color_space.split(":") + header += " C%s%s%s\n" % (cs[0], cs[1], cs[2]) + #estimate the sample step based on subsample value + subsamples = [int(c) for c in cs] + r_step = [1, int(subsamples[2] == 0) + 1, int(subsamples[2] == 0) + 1] + c_step = [1, 4 // subsamples[1], 4 // subsamples[1]] + #write in frames + with open(args.output, "wb") as y4m: + y4m.write(header) + for f in frames: + y4m.write("FRAME\n") + px = f.load() + for k in xrange(3): + for i in xrange(0, f.height, r_step[k]): + for j in xrange(0, f.width, c_step[k]): + yuv = px[j, i] + y4m.write(chr(yuv[k])) + + +if __name__ == "__main__": + args = parser.parse_args() + frames = [] + frames_mv = [] + for filename in listdir(args.frame_path): + name, ext = filename.split(".") + if ext == "png": + name_parse = name.split("_") + idx = int(name_parse[-1]) + img = Image.open(path.join(args.frame_path, filename)) + if name_parse[-2] == "mv": + frames_mv.append((idx, img)) + else: + frames.append((idx, img)) + if len(frames) == 0: + print("No frames in directory: " + args.frame_path) + sys.exit() + print("----------------------Y4M Info----------------------") + print("width: %d" % frames[0][1].width) + print("height: %d" % frames[0][1].height) + print("#frame: %d" % len(frames)) + print("frame rate: %s" % args.frame_rate) + print("interlacing: %s" % args.interlacing) + print("pixel ratio: %s" % args.pix_ratio) + print("color space: %s" % args.color_space) + print("----------------------------------------------------") + + print("Generating ...") + generate(args, frames) + if len(frames_mv) != 0: + args.output = args.output.replace(".y4m", "_mv.y4m") + generate(args, frames_mv) diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde new file mode 100644 index 0000000000..7249ee972e --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde @@ -0,0 +1,163 @@ +/* + *AABB bounding box + *Bouding Volume Hierarchy + */ +class BoundingBox { + float min_x, min_y, min_z, max_x, max_y, max_z; + PVector center; + BoundingBox() { + min_x = Float.POSITIVE_INFINITY; + min_y = Float.POSITIVE_INFINITY; + min_z = Float.POSITIVE_INFINITY; + max_x = Float.NEGATIVE_INFINITY; + max_y = Float.NEGATIVE_INFINITY; + max_z = Float.NEGATIVE_INFINITY; + center = new PVector(); + } + // build a bounding box for a triangle + void create(Triangle t) { + min_x = min(t.p1.x, min(t.p2.x, t.p3.x)); + max_x = max(t.p1.x, max(t.p2.x, t.p3.x)); + + min_y = min(t.p1.y, min(t.p2.y, t.p3.y)); + max_y = max(t.p1.y, max(t.p2.y, t.p3.y)); + + min_z = min(t.p1.z, min(t.p2.z, t.p3.z)); + max_z = max(t.p1.z, max(t.p2.z, t.p3.z)); + center.x = (max_x + min_x) / 2; + center.y = (max_y + min_y) / 2; + center.z = (max_z + min_z) / 2; + } + // merge two bounding boxes + void add(BoundingBox bbx) { + min_x = min(min_x, bbx.min_x); + min_y = min(min_y, bbx.min_y); + min_z = min(min_z, bbx.min_z); + + max_x = max(max_x, bbx.max_x); + max_y = max(max_y, bbx.max_y); + max_z = max(max_z, bbx.max_z); + center.x = (max_x + min_x) / 2; + center.y = (max_y + min_y) / 2; + center.z = (max_z + min_z) / 2; + } + // get bounding box center axis value + float getCenterAxisValue(int axis) { + if (axis == 1) { + return center.x; + } else if (axis == 2) { + return center.y; + } + // when axis == 3 + return center.z; + } + // check if a ray is intersected with the bounding box + boolean intersect(Ray r) { + float tmin, tmax; + if (r.dir.x >= 0) { + tmin = (min_x - r.ori.x) * (1.0f / r.dir.x); + tmax = (max_x - r.ori.x) * (1.0f / r.dir.x); + } else { + tmin = (max_x - r.ori.x) * (1.0f / r.dir.x); + tmax = (min_x - r.ori.x) * (1.0f / r.dir.x); + } + + float tymin, tymax; + if (r.dir.y >= 0) { + tymin = (min_y - r.ori.y) * (1.0f / r.dir.y); + tymax = (max_y - r.ori.y) * (1.0f / r.dir.y); + } else { + tymin = (max_y - r.ori.y) * (1.0f / r.dir.y); + tymax = (min_y - r.ori.y) * (1.0f / r.dir.y); + } + + if (tmax < tymin || tymax < tmin) { + return false; + } + + tmin = tmin < tymin ? tymin : tmin; + tmax = tmax > tymax ? tymax : tmax; + + float tzmin, tzmax; + if (r.dir.z >= 0) { + tzmin = (min_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (max_z - r.ori.z) * (1.0f / r.dir.z); + } else { + tzmin = (max_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (min_z - r.ori.z) * (1.0f / r.dir.z); + } + if (tmax < tzmin || tmin > tzmax) { + return false; + } + return true; + } +} +// Bounding Volume Hierarchy +class BVH { + // Binary Tree + BVH left, right; + BoundingBox overall_bbx; + ArrayList mesh; + BVH(ArrayList mesh) { + this.mesh = mesh; + overall_bbx = new BoundingBox(); + left = null; + right = null; + int mesh_size = this.mesh.size(); + if (mesh_size <= 1) { + return; + } + // random select an axis + int axis = int(random(100)) % 3 + 1; + // build bounding box and save the selected center component + float[] axis_values = new float[mesh_size]; + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + overall_bbx.add(t.bbx); + axis_values[i] = t.bbx.getCenterAxisValue(axis); + } + // find the median value of selected center component as pivot + axis_values = sort(axis_values); + float pivot; + if (mesh_size % 2 == 1) { + pivot = axis_values[mesh_size / 2]; + } else { + pivot = + 0.5f * (axis_values[mesh_size / 2 - 1] + axis_values[mesh_size / 2]); + } + // Build left node and right node by partitioning the mesh based on triangle + // bounding box center component value + ArrayList left_mesh = new ArrayList(); + ArrayList right_mesh = new ArrayList(); + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + if (t.bbx.getCenterAxisValue(axis) < pivot) { + left_mesh.add(t); + } else if (t.bbx.getCenterAxisValue(axis) > pivot) { + right_mesh.add(t); + } else if (left_mesh.size() < right_mesh.size()) { + left_mesh.add(t); + } else { + right_mesh.add(t); + } + } + left = new BVH(left_mesh); + right = new BVH(right_mesh); + } + // check if a ray intersect with current volume + boolean intersect(Ray r, float[] param) { + if (mesh.size() == 0) { + return false; + } + if (mesh.size() == 1) { + Triangle t = mesh.get(0); + return t.intersect(r, param); + } + if (!overall_bbx.intersect(r)) { + return false; + } + boolean left_res = left.intersect(r, param); + boolean right_res = right.intersect(r, param); + return left_res || right_res; + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde new file mode 100644 index 0000000000..b39dae3a19 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde @@ -0,0 +1,138 @@ +class Camera { + // camera's field of view + float fov; + // camera's position, look at point and axis + PVector pos, center, axis; + PVector init_pos, init_center, init_axis; + float move_speed; + float rot_speed; + Camera(float fov, PVector pos, PVector center, PVector axis) { + this.fov = fov; + this.pos = pos; + this.center = center; + this.axis = axis; + this.axis.normalize(); + move_speed = 0.001; + rot_speed = 0.01 * PI; + init_pos = pos.copy(); + init_center = center.copy(); + init_axis = axis.copy(); + } + + Camera copy() { + Camera cam = new Camera(fov, pos.copy(), center.copy(), axis.copy()); + return cam; + } + + PVector project(PVector pos) { + PVector proj = MatxVec3(getCameraMat(), PVector.sub(pos, this.pos)); + proj.x = (float)height / 2.0 * proj.x / proj.z / tan(fov / 2.0f); + proj.y = (float)height / 2.0 * proj.y / proj.z / tan(fov / 2.0f); + proj.z = proj.z; + return proj; + } + + float[] getCameraMat() { + float[] mat = new float[9]; + PVector dir = PVector.sub(center, pos); + dir.normalize(); + PVector left = dir.cross(axis); + left.normalize(); + // processing camera system does not follow right hand rule + mat[0] = -left.x; + mat[1] = -left.y; + mat[2] = -left.z; + mat[3] = axis.x; + mat[4] = axis.y; + mat[5] = axis.z; + mat[6] = dir.x; + mat[7] = dir.y; + mat[8] = dir.z; + + return mat; + } + + void run() { + PVector dir, left; + if (mousePressed) { + float angleX = (float)mouseX / width * PI - PI / 2; + float angleY = (float)mouseY / height * PI - PI; + PVector diff = PVector.sub(center, pos); + float radius = diff.mag(); + pos.x = radius * sin(angleY) * sin(angleX) + center.x; + pos.y = radius * cos(angleY) + center.y; + pos.z = radius * sin(angleY) * cos(angleX) + center.z; + dir = PVector.sub(center, pos); + dir.normalize(); + PVector up = new PVector(0, 1, 0); + left = up.cross(dir); + left.normalize(); + axis = dir.cross(left); + axis.normalize(); + } + + if (keyPressed) { + switch (key) { + case 'w': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.add(pos, PVector.mult(dir, move_speed)); + center = PVector.add(center, PVector.mult(dir, move_speed)); + break; + case 's': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.sub(pos, PVector.mult(dir, move_speed)); + center = PVector.sub(center, PVector.mult(dir, move_speed)); + break; + case 'a': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.add(pos, PVector.mult(left, move_speed)); + center = PVector.add(center, PVector.mult(left, move_speed)); + break; + case 'd': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.sub(pos, PVector.mult(left, move_speed)); + center = PVector.sub(center, PVector.mult(left, move_speed)); + break; + case 'r': + dir = PVector.sub(center, pos); + dir.normalize(); + float[] mat = getRotationMat3x3(rot_speed, dir.x, dir.y, dir.z); + axis = MatxVec3(mat, axis); + axis.normalize(); + break; + case 'b': + pos = init_pos.copy(); + center = init_center.copy(); + axis = init_axis.copy(); + break; + case '+': move_speed *= 2.0f; break; + case '-': move_speed /= 2.0; break; + case CODED: + if (keyCode == UP) { + pos = PVector.add(pos, PVector.mult(axis, move_speed)); + center = PVector.add(center, PVector.mult(axis, move_speed)); + } else if (keyCode == DOWN) { + pos = PVector.sub(pos, PVector.mult(axis, move_speed)); + center = PVector.sub(center, PVector.mult(axis, move_speed)); + } + } + } + } + void open() { + perspective(fov, float(width) / height, 1e-6, 1e5); + camera(pos.x, pos.y, pos.z, center.x, center.y, center.z, axis.x, axis.y, + axis.z); + } + void close() { + ortho(-width, 0, -height, 0); + camera(0, 0, 0, 0, 0, 1, 0, 1, 0); + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde new file mode 100644 index 0000000000..a5e04b6a92 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde @@ -0,0 +1,102 @@ +class MotionField { + int block_size; + ArrayList motion_field; + MotionField(int block_size) { + this.block_size = block_size; + motion_field = new ArrayList(); + } + + void update(Camera last_cam, Camera current_cam, PointCloud point_cloud, + BVH bvh) { + // clear motion field + motion_field = new ArrayList(); + int r_num = height / block_size, c_num = width / block_size; + for (int i = 0; i < r_num * c_num; i++) + motion_field.add(new PVector(0, 0, 0)); + // estimate motion vector of each point in point cloud + for (int i = 0; i < point_cloud.size(); i++) { + PVector p = point_cloud.getPosition(i); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + int row = int((p0.y + height / 2.0f) / block_size); + int col = int((p0.x + width / 2.0f) / block_size); + if (row >= 0 && row < r_num && col >= 0 && col < c_num) { + PVector accu = motion_field.get(row * c_num + col); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // if some blocks do not have point, then use ray tracing to see if they are + // in triangles + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector accu = motion_field.get(i * c_num + j); + if (accu.z > 0) { + continue; + } + // use the center of the block to generate view ray + float cx = j * block_size + block_size / 2.0f - width / 2.0f; + float cy = i * block_size + block_size / 2.0f - height / 2.0f; + float cz = 0.5f * height / tan(current_cam.fov / 2.0f); + PVector dir = new PVector(cx, cy, cz); + float[] camMat = current_cam.getCameraMat(); + dir = MatxVec3(transpose3x3(camMat), dir); + dir.normalize(); + Ray r = new Ray(current_cam.pos, dir); + // ray tracing + float[] param = new float[4]; + param[0] = Float.POSITIVE_INFINITY; + if (bvh.intersect(r, param)) { + PVector p = new PVector(param[1], param[2], param[3]); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // estimate the motion vector of each block + for (int i = 0; i < r_num * c_num; i++) { + PVector mv = motion_field.get(i); + if (mv.z > 0) { + motion_field.set(i, new PVector(mv.x / mv.z, mv.y / mv.z, 0)); + } else // there is nothing in the block, use -1 to mark it. + { + motion_field.set(i, new PVector(0.0, 0.0, -1)); + } + } + } + + void render() { + int r_num = height / block_size, c_num = width / block_size; + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + float ox = j * block_size + 0.5f * block_size; + float oy = i * block_size + 0.5f * block_size; + stroke(255, 0, 0); + line(ox, oy, ox + mv.x, oy + mv.y); + } + } + + void save(String path) { + int r_num = height / block_size; + int c_num = width / block_size; + String[] mvs = new String[r_num]; + for (int i = 0; i < r_num; i++) { + mvs[i] = ""; + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + if (mv.z != -1) { + mvs[i] += str(mv.x) + "," + str(mv.y); + } else // there is nothing + { + mvs[i] += "-,-"; + } + if (j != c_num - 1) mvs[i] += ";"; + } + } + saveStrings(path, mvs); + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde new file mode 100644 index 0000000000..714a6f3a0b --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde @@ -0,0 +1,138 @@ +class PointCloud { + ArrayList points; // array to save points + IntList point_colors; // array to save points color + PVector cloud_mass; + float[] depth; + boolean[] real; + PointCloud() { + // initialize + points = new ArrayList(); + point_colors = new IntList(); + cloud_mass = new PVector(0, 0, 0); + depth = new float[width * height]; + real = new boolean[width * height]; + } + + void generate(PImage rgb_img, PImage depth_img, Transform trans) { + if (depth_img.width != width || depth_img.height != height || + rgb_img.width != width || rgb_img.height != height) { + println("rgb and depth file dimension should be same with window size"); + exit(); + } + // clear depth and real + for (int i = 0; i < width * height; i++) { + depth[i] = 0; + real[i] = false; + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + // get depth value (red channel) + color depth_px = depth_img.get(u, v); + depth[v * width + u] = depth_px & 0x0000FFFF; + if (int(depth[v * width + u]) != 0) { + real[v * width + u] = true; + } + point_colors.append(rgb_img.get(u, v)); + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + if (int(depth[v * width + u]) == 0) { + interpolateDepth(v, u); + } + // add transformed pixel as well as pixel color to the list + PVector pos = trans.transform(u, v, int(depth[v * width + u])); + points.add(pos); + // accumulate z value + cloud_mass = PVector.add(cloud_mass, pos); + } + } + void fillInDepthAlongPath(float d, Node node) { + node = node.parent; + while (node != null) { + int i = node.row; + int j = node.col; + if (depth[i * width + j] == 0) { + depth[i * width + j] = d; + } + node = node.parent; + } + } + // interpolate + void interpolateDepth(int row, int col) { + if (row < 0 || row >= height || col < 0 || col >= width || + int(depth[row * width + col]) != 0) { + return; + } + ArrayList queue = new ArrayList(); + queue.add(new Node(row, col, null)); + boolean[] visited = new boolean[width * height]; + for (int i = 0; i < width * height; i++) visited[i] = false; + visited[row * width + col] = true; + // Using BFS to Find the Nearest Neighbor + while (queue.size() > 0) { + // pop + Node node = queue.get(0); + queue.remove(0); + int i = node.row; + int j = node.col; + // if current position have a real depth + if (depth[i * width + j] != 0 && real[i * width + j]) { + fillInDepthAlongPath(depth[i * width + j], node); + break; + } else { + // search unvisited 8 neighbors + for (int r = max(0, i - 1); r < min(height, i + 2); r++) { + for (int c = max(0, j - 1); c < min(width, j + 2); c++) { + if (!visited[r * width + c]) { + visited[r * width + c] = true; + queue.add(new Node(r, c, node)); + } + } + } + } + } + } + // get point cloud size + int size() { return points.size(); } + // get ith position + PVector getPosition(int i) { + if (i >= points.size()) { + println("point position: index " + str(i) + " exceeds"); + exit(); + } + return points.get(i); + } + // get ith color + color getColor(int i) { + if (i >= point_colors.size()) { + println("point color: index " + str(i) + " exceeds"); + exit(); + } + return point_colors.get(i); + } + // get cloud center + PVector getCloudCenter() { + if (points.size() > 0) { + return PVector.div(cloud_mass, points.size()); + } + return new PVector(0, 0, 0); + } + // merge two clouds + void merge(PointCloud point_cloud) { + for (int i = 0; i < point_cloud.size(); i++) { + points.add(point_cloud.getPosition(i)); + point_colors.append(point_cloud.getColor(i)); + } + cloud_mass = PVector.add(cloud_mass, point_cloud.cloud_mass); + } +} + +class Node { + int row, col; + Node parent; + Node(int row, int col, Node parent) { + this.row = row; + this.col = col; + this.parent = parent; + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde new file mode 100644 index 0000000000..ef4be691c2 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde @@ -0,0 +1,61 @@ +// Triangle +class Triangle { + // position + PVector p1, p2, p3; + // color + color c1, c2, c3; + BoundingBox bbx; + Triangle(PVector p1, PVector p2, PVector p3, color c1, color c2, color c3) { + this.p1 = p1; + this.p2 = p2; + this.p3 = p3; + this.c1 = c1; + this.c2 = c2; + this.c3 = c3; + bbx = new BoundingBox(); + bbx.create(this); + } + // check to see if a ray intersects with the triangle + boolean intersect(Ray r, float[] param) { + PVector p21 = PVector.sub(p2, p1); + PVector p31 = PVector.sub(p3, p1); + PVector po1 = PVector.sub(r.ori, p1); + + PVector dxp31 = r.dir.cross(p31); + PVector po1xp21 = po1.cross(p21); + float denom = p21.dot(dxp31); + float t = p31.dot(po1xp21) / denom; + float alpha = po1.dot(dxp31) / denom; + float beta = r.dir.dot(po1xp21) / denom; + + boolean res = t > 0 && alpha > 0 && alpha < 1 && beta > 0 && beta < 1 && + alpha + beta < 1; + // depth test + if (res && t < param[0]) { + param[0] = t; + param[1] = alpha * p1.x + beta * p2.x + (1 - alpha - beta) * p3.x; + param[2] = alpha * p1.y + beta * p2.y + (1 - alpha - beta) * p3.y; + param[3] = alpha * p1.z + beta * p2.z + (1 - alpha - beta) * p3.z; + } + return res; + } + void render() { + beginShape(TRIANGLES); + fill(c1); + vertex(p1.x, p1.y, p1.z); + fill(c2); + vertex(p2.x, p2.y, p2.z); + fill(c3); + vertex(p3.x, p3.y, p3.z); + endShape(); + } +} +// Ray +class Ray { + // origin and direction + PVector ori, dir; + Ray(PVector ori, PVector dir) { + this.ori = ori; + this.dir = dir; + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde new file mode 100644 index 0000000000..cf79ab7141 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde @@ -0,0 +1,59 @@ +class Scene { + PointCloud point_cloud; + ArrayList mesh; + BVH bvh; + MotionField motion_field; + Camera last_cam; + Camera current_cam; + int frame_count; + + Scene(Camera camera, PointCloud point_cloud, MotionField motion_field) { + this.point_cloud = point_cloud; + this.motion_field = motion_field; + mesh = new ArrayList(); + for (int v = 0; v < height - 1; v++) + for (int u = 0; u < width - 1; u++) { + PVector p1 = point_cloud.getPosition(v * width + u); + PVector p2 = point_cloud.getPosition(v * width + u + 1); + PVector p3 = point_cloud.getPosition((v + 1) * width + u + 1); + PVector p4 = point_cloud.getPosition((v + 1) * width + u); + color c1 = point_cloud.getColor(v * width + u); + color c2 = point_cloud.getColor(v * width + u + 1); + color c3 = point_cloud.getColor((v + 1) * width + u + 1); + color c4 = point_cloud.getColor((v + 1) * width + u); + mesh.add(new Triangle(p1, p2, p3, c1, c2, c3)); + mesh.add(new Triangle(p3, p4, p1, c3, c4, c1)); + } + bvh = new BVH(mesh); + last_cam = camera.copy(); + current_cam = camera; + frame_count = 0; + } + + void run() { + last_cam = current_cam.copy(); + current_cam.run(); + motion_field.update(last_cam, current_cam, point_cloud, bvh); + frame_count += 1; + } + + void render(boolean show_motion_field) { + // build mesh + current_cam.open(); + noStroke(); + for (int i = 0; i < mesh.size(); i++) { + Triangle t = mesh.get(i); + t.render(); + } + if (show_motion_field) { + current_cam.close(); + motion_field.render(); + } + } + + void save(String path) { saveFrame(path + "_" + str(frame_count) + ".png"); } + + void saveMotionField(String path) { + motion_field.save(path + "_" + str(frame_count) + ".txt"); + } +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde new file mode 100644 index 0000000000..af2204e8cf --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde @@ -0,0 +1,82 @@ +class Transform { + float[] inv_rot; // inverse of rotation matrix + PVector inv_mov; // inverse of movement vector + float focal; // the focal distacne of real camera + int w, h; // the width and height of the frame + float normalier; // nomalization factor of depth + Transform(float tx, float ty, float tz, float qx, float qy, float qz, + float qw, float fov, int w, int h, float normalier) { + // currently, we did not use the info of real camera's position and + // quaternion maybe we will use it in the future when combine all frames + float[] rot = quaternion2Mat3x3(qx, qy, qz, qw); + inv_rot = transpose3x3(rot); + inv_mov = new PVector(-tx, -ty, -tz); + this.focal = 0.5f * h / tan(fov / 2.0); + this.w = w; + this.h = h; + this.normalier = normalier; + } + + PVector transform(int i, int j, float d) { + // transfer from camera view to world view + float z = d / normalier; + float x = (i - w / 2.0f) * z / focal; + float y = (j - h / 2.0f) * z / focal; + return new PVector(x, y, z); + } +} + +// get rotation matrix by using rotation axis and angle +float[] getRotationMat3x3(float angle, float ax, float ay, float az) { + float[] mat = new float[9]; + float c = cos(angle); + float s = sin(angle); + mat[0] = c + ax * ax * (1 - c); + mat[1] = ax * ay * (1 - c) - az * s; + mat[2] = ax * az * (1 - c) + ay * s; + mat[3] = ay * ax * (1 - c) + az * s; + mat[4] = c + ay * ay * (1 - c); + mat[5] = ay * az * (1 - c) - ax * s; + mat[6] = az * ax * (1 - c) - ay * s; + mat[7] = az * ay * (1 - c) + ax * s; + mat[8] = c + az * az * (1 - c); + return mat; +} + +// get rotation matrix by using quaternion +float[] quaternion2Mat3x3(float qx, float qy, float qz, float qw) { + float[] mat = new float[9]; + mat[0] = 1 - 2 * qy * qy - 2 * qz * qz; + mat[1] = 2 * qx * qy - 2 * qz * qw; + mat[2] = 2 * qx * qz + 2 * qy * qw; + mat[3] = 2 * qx * qy + 2 * qz * qw; + mat[4] = 1 - 2 * qx * qx - 2 * qz * qz; + mat[5] = 2 * qy * qz - 2 * qx * qw; + mat[6] = 2 * qx * qz - 2 * qy * qw; + mat[7] = 2 * qy * qz + 2 * qx * qw; + mat[8] = 1 - 2 * qx * qx - 2 * qy * qy; + return mat; +} + +// tranpose a 3x3 matrix +float[] transpose3x3(float[] mat) { + float[] Tmat = new float[9]; + for (int i = 0; i < 3; i++) + for (int j = 0; j < 3; j++) { + Tmat[i * 3 + j] = mat[j * 3 + i]; + } + return Tmat; +} + +// multiply a matrix with vector +PVector MatxVec3(float[] mat, PVector v) { + float[] vec = v.array(); + float[] res = new float[3]; + for (int i = 0; i < 3; i++) { + res[i] = 0.0f; + for (int j = 0; j < 3; j++) { + res[i] += mat[i * 3 + j] * vec[j]; + } + } + return new PVector(res[0], res[1], res[2]); +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde new file mode 100644 index 0000000000..19d124a0b3 --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde @@ -0,0 +1,28 @@ +// show grids +void showGrids(int block_size) { + ortho(-width, 0, -height, 0); + camera(0, 0, 0, 0, 0, 1, 0, 1, 0); + stroke(0, 0, 255); + for (int i = 0; i < height; i += block_size) { + line(0, i, width, i); + } + for (int i = 0; i < width; i += block_size) { + line(i, 0, i, height); + } +} + +// save the point clould information +void savePointCloud(PointCloud point_cloud, String file_name) { + String[] positions = new String[point_cloud.points.size()]; + String[] colors = new String[point_cloud.points.size()]; + for (int i = 0; i < point_cloud.points.size(); i++) { + PVector point = point_cloud.getPosition(i); + color point_color = point_cloud.getColor(i); + positions[i] = str(point.x) + ' ' + str(point.y) + ' ' + str(point.z); + colors[i] = str(((point_color >> 16) & 0xFF) / 255.0) + ' ' + + str(((point_color >> 8) & 0xFF) / 255.0) + ' ' + + str((point_color & 0xFF) / 255.0); + } + saveStrings(file_name + "_pos.txt", positions); + saveStrings(file_name + "_color.txt", colors); +} diff --git a/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde new file mode 100644 index 0000000000..22a495432d --- /dev/null +++ b/media/libvpx/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde @@ -0,0 +1,74 @@ +/*The dataset is from + *Computer Vision Group + *TUM Department of Informatics Technical + *University of Munich + *https://vision.in.tum.de/data/datasets/rgbd-dataset/download#freiburg1_xyz + */ +Scene scene; +void setup() { + size(640, 480, P3D); + // default settings + int frame_no = 0; // frame number + float fov = PI / 3; // field of view + int block_size = 8; // block size + float normalizer = 5000.0f; // normalizer + // initialize + PointCloud point_cloud = new PointCloud(); + // synchronized rgb, depth and ground truth + String head = "../data/"; + String[] rgb_depth_gt = loadStrings(head + "rgb_depth_groundtruth.txt"); + // read in rgb and depth image file paths as well as corresponding camera + // posiiton and quaternion + String[] info = split(rgb_depth_gt[frame_no], ' '); + String rgb_path = head + info[1]; + String depth_path = head + info[3]; + float tx = float(info[7]), ty = float(info[8]), + tz = float(info[9]); // real camera position + float qx = float(info[10]), qy = float(info[11]), qz = float(info[12]), + qw = float(info[13]); // quaternion + + // build transformer + Transform trans = + new Transform(tx, ty, tz, qx, qy, qz, qw, fov, width, height, normalizer); + PImage rgb = loadImage(rgb_path); + PImage depth = loadImage(depth_path); + // generate point cloud + point_cloud.generate(rgb, depth, trans); + // initialize camera + Camera camera = new Camera(fov, new PVector(0, 0, 0), new PVector(0, 0, 1), + new PVector(0, 1, 0)); + // initialize motion field + MotionField motion_field = new MotionField(block_size); + // initialize scene + scene = new Scene(camera, point_cloud, motion_field); +} +boolean inter = false; +void draw() { + background(0); + // run camera dragged mouse to rotate camera + // w: go forward + // s: go backward + // a: go left + // d: go right + // up arrow: go up + // down arrow: go down + //+ increase move speed + //- decrease move speed + // r: rotate the camera + // b: reset to initial position + scene.run(); // true: make interpolation; false: do not make + // interpolation + if (keyPressed && key == 'o') { + inter = true; + } + scene.render( + false); // true: turn on motion field; false: turn off motion field + // save frame with no motion field + scene.save("../data/frame/raw"); + background(0); + scene.render(true); + showGrids(scene.motion_field.block_size); + // save frame with motion field + scene.save("../data/frame/raw_mv"); + scene.saveMotionField("../data/frame/mv"); +} diff --git a/media/libvpx/libvpx/tools/README.pgo.md b/media/libvpx/libvpx/tools/README.pgo.md new file mode 100644 index 0000000000..414743f8fc --- /dev/null +++ b/media/libvpx/libvpx/tools/README.pgo.md @@ -0,0 +1,24 @@ +# Using Profile Guided Optimizations to identify compiler optimization failures + +When using Clang, the `-Rpass-missed` flag enables the verbose log of failed +compiler optimizations. However, the extensive log messages can obscure +potential optimization opportunities. + +Use the following steps to generate a more transparent optimization report +using a previously created PGO profile file. The report also includes code +hotness diagnostics: + +```bash +$ ../libvpx/configure --use-profile=perf.profdata \ + --extra-cflags="-fsave-optimization-record -fdiagnostics-show-hotness" +``` + +Convert the generated YAML files into a detailed HTML report using the +[optviewer2](https://github.com/OfekShilon/optview2) tool: + +```bash +$ opt-viewer.py --output-dir=out/ --source-dir=libvpx . +``` + +The HTML report displays each code line's relative hotness, cross-referenced +with the failed compiler optimizations. diff --git a/media/libvpx/libvpx/tools/all_builds.py b/media/libvpx/libvpx/tools/all_builds.py deleted file mode 100644 index 54176f58ab..0000000000 --- a/media/libvpx/libvpx/tools/all_builds.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import getopt -import subprocess -import sys - -LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats" - -def RunCommand(command): - run = subprocess.Popen(command, shell=True) - output = run.communicate() - if run.returncode: - print "Non-zero return code: " + str(run.returncode) + " => exiting!" - sys.exit(1) - -def list_of_experiments(): - experiments = [] - configure_file = open("configure") - list_start = False - for line in configure_file.read().split("\n"): - if line == 'EXPERIMENT_LIST="': - list_start = True - elif line == '"': - list_start = False - elif list_start: - currently_broken = ["csm"] - experiment = line[4:] - if experiment not in currently_broken: - experiments.append(experiment) - return experiments - -def main(argv): - # Parse arguments - options = {"--shard": 0, "--shards": 1} - if "--" in argv: - opt_end_index = argv.index("--") - else: - opt_end_index = len(argv) - try: - o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) - except getopt.GetoptError, err: - print str(err) - print "Usage: %s [--shard= --shards=] -- [configure flag ...]"%argv[0] - sys.exit(2) - - options.update(o) - extra_args = argv[opt_end_index + 1:] - - # Shard experiment list - shard = int(options["--shard"]) - shards = int(options["--shards"]) - experiments = list_of_experiments() - base_command = " ".join([BASE_COMMAND] + extra_args) - configs = [base_command] - configs += ["%s --enable-%s" % (base_command, e) for e in experiments] - my_configs = zip(configs, range(len(configs))) - my_configs = filter(lambda x: x[1] % shards == shard, my_configs) - my_configs = [e[0] for e in my_configs] - - # Run configs for this shard - for config in my_configs: - test_build(config) - -def test_build(configure_command): - print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) - RunCommand(configure_command) - RunCommand("make clean") - RunCommand("make") - -if __name__ == "__main__": - main(sys.argv) diff --git a/media/libvpx/libvpx/tools/author_first_release.sh b/media/libvpx/libvpx/tools/author_first_release.sh deleted file mode 100644 index 7b0b797212..0000000000 --- a/media/libvpx/libvpx/tools/author_first_release.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -## -## List the release each author first contributed to. -## -## Usage: author_first_release.sh [TAGS] -## -## If the TAGS arguments are unspecified, all tags reported by `git tag` -## will be considered. -## -tags=${@:-$(git tag)} -for tag in $tags; do - git shortlog -n -e -s $tag | - cut -f2- | - awk "{print \"${tag#v}\t\"\$0}" -done | sort -k2 | uniq -f2 diff --git a/media/libvpx/libvpx/tools/cpplint.py b/media/libvpx/libvpx/tools/cpplint.py index 25fbef73d8..e3ebde2f5a 100644 --- a/media/libvpx/libvpx/tools/cpplint.py +++ b/media/libvpx/libvpx/tools/cpplint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. # @@ -51,16 +51,23 @@ import sre_compile import string import sys import unicodedata +import sysconfig + +try: + xrange # Python 2 +except NameError: + xrange = range # Python 3 _USAGE = """ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] [--counting=total|toplevel|detailed] [--root=subdir] - [--linelength=digits] + [--linelength=digits] [--headers=x,y,...] + [--quiet] [file] ... The style guidelines this tries to follow are those in - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml Every problem is given a confidence score from 1-5, with 5 meaning we are certain of the problem, and 1 meaning it could be a legitimate construct. @@ -83,6 +90,9 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] verbose=# Specify a number 0-5 to restrict errors to certain verbosity levels. + quiet + Don't print anything if no errors are found. + filter=-x,+y,... Specify a comma-separated list of category-filters to apply: only error messages whose category names pass the filters will be printed. @@ -114,12 +124,13 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] ignored. Examples: - Assuing that src/.git exists, the header guard CPP variables for - src/chrome/browser/ui/browser.h are: + Assuming that top/src/.git exists (and cwd=top/src), the header guard + CPP variables for top/src/chrome/browser/ui/browser.h are: No flag => CHROME_BROWSER_UI_BROWSER_H_ --root=chrome => BROWSER_UI_BROWSER_H_ --root=chrome/browser => UI_BROWSER_H_ + --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_ linelength=digits This is the allowed line length for the project. The default value is @@ -133,6 +144,57 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] Examples: --extensions=hpp,cpp + + headers=x,y,... + The header extensions that cpplint will treat as .h in checks. Values are + automatically added to --extensions list. + + Examples: + --headers=hpp,hxx + --headers=hpp + + cpplint.py supports per-directory configurations specified in CPPLINT.cfg + files. CPPLINT.cfg file can contain a number of key=value pairs. + Currently the following options are supported: + + set noparent + filter=+filter1,-filter2,... + exclude_files=regex + linelength=80 + root=subdir + headers=x,y,... + + "set noparent" option prevents cpplint from traversing directory tree + upwards looking for more .cfg files in parent directories. This option + is usually placed in the top-level project directory. + + The "filter" option is similar in function to --filter flag. It specifies + message filters in addition to the |_DEFAULT_FILTERS| and those specified + through --filter command-line flag. + + "exclude_files" allows to specify a regular expression to be matched against + a file name. If the expression matches, the file is skipped and not run + through liner. + + "linelength" allows to specify the allowed line length for the project. + + The "root" option is similar in function to the --root flag (see example + above). Paths are relative to the directory of the CPPLINT.cfg. + + The "headers" option is similar in function to the --headers flag + (see example above). + + CPPLINT.cfg has an effect on files in the same directory and all + sub-directories, unless overridden by a nested configuration file. + + Example file: + filter=-build/include_order,+build/include_alpha + exclude_files=.*\.cc + + The above example disables build/include_order warning and enables + build/include_alpha as well as excludes all .cc from being + processed by linter, in the current directory (where the .cfg + file is located) and all sub-directories. """ # We categorize each error message we print. Here are the categories. @@ -140,81 +202,101 @@ Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] # If you add a new error message with a new category, add it to the list # here! cpplint_unittest.py should tell you if you forget to do this. _ERROR_CATEGORIES = [ - 'build/class', - 'build/deprecated', - 'build/endif_comment', - 'build/explicit_make_pair', - 'build/forward_decl', - 'build/header_guard', - 'build/include', - 'build/include_alpha', - 'build/include_order', - 'build/include_what_you_use', - 'build/namespaces', - 'build/printf_format', - 'build/storage_class', - 'legal/copyright', - 'readability/alt_tokens', - 'readability/braces', - 'readability/casting', - 'readability/check', - 'readability/constructors', - 'readability/fn_size', - 'readability/function', - 'readability/multiline_comment', - 'readability/multiline_string', - 'readability/namespace', - 'readability/nolint', - 'readability/nul', - 'readability/streams', - 'readability/todo', - 'readability/utf8', - 'runtime/arrays', - 'runtime/casting', - 'runtime/explicit', - 'runtime/int', - 'runtime/init', - 'runtime/invalid_increment', - 'runtime/member_string_references', - 'runtime/memset', - 'runtime/operator', - 'runtime/printf', - 'runtime/printf_format', - 'runtime/references', - 'runtime/sizeof', - 'runtime/string', - 'runtime/threadsafe_fn', - 'runtime/vlog', - 'whitespace/blank_line', - 'whitespace/braces', - 'whitespace/comma', - 'whitespace/comments', - 'whitespace/empty_conditional_body', - 'whitespace/empty_loop_body', - 'whitespace/end_of_line', - 'whitespace/ending_newline', - 'whitespace/forcolon', - 'whitespace/indent', - 'whitespace/line_length', - 'whitespace/newline', - 'whitespace/operators', - 'whitespace/parens', - 'whitespace/semicolon', - 'whitespace/tab', - 'whitespace/todo' - ] + 'build/class', + 'build/c++11', + 'build/c++14', + 'build/c++tr1', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/inheritance', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/strings', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/indentation_namespace', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_if_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo', + ] -# The default state of the category filter. This is overrided by the --filter= +# These error categories are no longer enforced by cpplint, but for backwards- +# compatibility they may still appear in NOLINT comments. +_LEGACY_ERROR_CATEGORIES = [ + 'readability/streams', + 'readability/function', + ] + +# The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). # All entries here should start with a '-' or '+', as in the --filter= flag. _DEFAULT_FILTERS = ['-build/include_alpha'] +# The default list of categories suppressed for C (not C++) files. +_DEFAULT_C_SUPPRESSED_CATEGORIES = [ + 'readability/casting', + ] + +# The default list of categories suppressed for Linux Kernel files. +_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [ + 'whitespace/tab', + ] + # We used to check for high-bit characters, but after much discussion we # decided those were OK, as long as they were in UTF-8 and didn't represent # hard-coded international strings, which belong in a separate i18n file. - # C++ headers _CPP_HEADERS = frozenset([ # Legacy @@ -304,6 +386,7 @@ _CPP_HEADERS = frozenset([ 'random', 'ratio', 'regex', + 'scoped_allocator', 'set', 'sstream', 'stack', @@ -351,15 +434,40 @@ _CPP_HEADERS = frozenset([ 'cwctype', ]) +# Type names +_TYPES = re.compile( + r'^(?:' + # [dcl.type.simple] + r'(char(16_t|32_t)?)|wchar_t|' + r'bool|short|int|long|signed|unsigned|float|double|' + # [support.types] + r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|' + # [cstdint.syn] + r'(u?int(_fast|_least)?(8|16|32|64)_t)|' + r'(u?int(max|ptr)_t)|' + r')$') + + +# These headers are excluded from [build/include] and [build/include_order] +# checks: +# - Anything not following google file name conventions (containing an +# uppercase character, such as Python.h or nsStringAPI.h, for example). +# - Lua headers. +_THIRD_PARTY_HEADERS_PATTERN = re.compile( + r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') + +# Pattern for matching FileInfo.BaseName() against test file name +_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$' + +# Pattern that matches only complete whitespace, possibly across multiple lines. +_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL) + # Assertion macros. These are defined in base/logging.h and -# testing/base/gunit.h. Note that the _M versions need to come first -# for substring matching to work. +# testing/base/public/gunit.h. _CHECK_MACROS = [ 'DCHECK', 'CHECK', - 'EXPECT_TRUE_M', 'EXPECT_TRUE', - 'ASSERT_TRUE_M', 'ASSERT_TRUE', - 'EXPECT_FALSE_M', 'EXPECT_FALSE', - 'ASSERT_FALSE_M', 'ASSERT_FALSE', + 'EXPECT_TRUE', 'ASSERT_TRUE', + 'EXPECT_FALSE', 'ASSERT_FALSE', ] # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE @@ -372,16 +480,12 @@ for op, replacement in [('==', 'EQ'), ('!=', 'NE'), _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement - _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement - _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), ('>=', 'LT'), ('>', 'LE'), ('<=', 'GT'), ('<', 'GE')]: _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement - _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement - _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement # Alternative tokens and their replacements. For full list, see section 2.5 # Alternative tokens [lex.digraph] in the C++ standard. @@ -430,12 +534,15 @@ _MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' r'(?:\s+(volatile|__volatile__))?' r'\s*[{(]') +# Match strings that indicate we're working on a C (not C++) file. +_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|' + r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))') + +# Match string that indicates we're working on a Linux Kernel file. +_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)') _regexp_compile_cache = {} -# Finds occurrences of NOLINT or NOLINT(...). -_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') - # {str, set(int)}: a map from error categories to sets of linenumbers # on which those errors are expected and should be suppressed. _error_suppressions = {} @@ -443,6 +550,7 @@ _error_suppressions = {} # The root directory used for deriving header guard CPP variable. # This is set by --root flag. _root = None +_root_debug = False # The allowed line length of files. # This is set by --linelength flag. @@ -452,8 +560,28 @@ _line_length = 80 # This is set by --extensions flag. _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) +# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc. +# This is set by --headers flag. +_hpp_headers = set(['h']) + +# {str, bool}: a map from error categories to booleans which indicate if the +# category should be suppressed for every line. +_global_error_suppressions = {} + +def ProcessHppHeadersOption(val): + global _hpp_headers + try: + _hpp_headers = set(val.split(',')) + # Automatically append to extensions list so it does not have to be set 2 times + _valid_extensions.update(_hpp_headers) + except ValueError: + PrintUsage('Header extensions must be comma separated list.') + +def IsHeaderExtension(file_extension): + return file_extension in _hpp_headers + def ParseNolintSuppressions(filename, raw_line, linenum, error): - """Updates the global list of error-suppressions. + """Updates the global list of line error-suppressions. Parses any NOLINT comments on the current line, updating the global error_suppressions store. Reports an error if the NOLINT comment @@ -465,42 +593,67 @@ def ParseNolintSuppressions(filename, raw_line, linenum, error): linenum: int, the number of the current line. error: function, an error handler. """ - # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). - matched = _RE_SUPPRESSION.search(raw_line) + matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line) if matched: - category = matched.group(1) + if matched.group(1): + suppressed_line = linenum + 1 + else: + suppressed_line = linenum + category = matched.group(2) if category in (None, '(*)'): # => "suppress all" - _error_suppressions.setdefault(None, set()).add(linenum) + _error_suppressions.setdefault(None, set()).add(suppressed_line) else: if category.startswith('(') and category.endswith(')'): category = category[1:-1] if category in _ERROR_CATEGORIES: - _error_suppressions.setdefault(category, set()).add(linenum) - else: + _error_suppressions.setdefault(category, set()).add(suppressed_line) + elif category not in _LEGACY_ERROR_CATEGORIES: error(filename, linenum, 'readability/nolint', 5, 'Unknown NOLINT error category: %s' % category) +def ProcessGlobalSuppresions(lines): + """Updates the list of global error suppressions. + + Parses any lint directives in the file that have global effect. + + Args: + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + """ + for line in lines: + if _SEARCH_C_FILE.search(line): + for category in _DEFAULT_C_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + if _SEARCH_KERNEL_FILE.search(line): + for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES: + _global_error_suppressions[category] = True + + def ResetNolintSuppressions(): - "Resets the set of NOLINT suppressions to empty." + """Resets the set of NOLINT suppressions to empty.""" _error_suppressions.clear() + _global_error_suppressions.clear() def IsErrorSuppressedByNolint(category, linenum): """Returns true if the specified error category is suppressed on this line. Consults the global error_suppressions map populated by - ParseNolintSuppressions/ResetNolintSuppressions. + ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions. Args: category: str, the category of the error. linenum: int, the current line number. Returns: - bool, True iff the error should be suppressed due to a NOLINT comment. + bool, True iff the error should be suppressed due to a NOLINT comment or + global suppression. """ - return (linenum in _error_suppressions.get(category, set()) or + return (_global_error_suppressions.get(category, False) or + linenum in _error_suppressions.get(category, set()) or linenum in _error_suppressions.get(None, set())) + def Match(pattern, s): """Matches the string with the pattern, caching the compiled regexp.""" # The regexp compilation caching is inlined in both Match and Search for @@ -536,11 +689,17 @@ def Search(pattern, s): return _regexp_compile_cache[pattern].search(s) -class _IncludeState(dict): +def _IsSourceExtension(s): + """File extension (excluding dot) matches a source file extension.""" + return s in ('c', 'cc', 'cpp', 'cxx') + + +class _IncludeState(object): """Tracks line numbers for includes, and the order in which includes appear. - As a dict, an _IncludeState object serves as a mapping between include - filename and line number on which that file was included. + include_list contains list of lists of (header, line number) pairs. + It's a lists of lists rather than just one flat list to make it + easier to update across preprocessor boundaries. Call CheckNextIncludeOrder() once for each header in the file, passing in the type constants defined above. Calls in an illegal order will @@ -571,15 +730,42 @@ class _IncludeState(dict): } def __init__(self): - dict.__init__(self) - self.ResetSection() + self.include_list = [[]] + self.ResetSection('') - def ResetSection(self): + def FindHeader(self, header): + """Check if a header has already been included. + + Args: + header: header to check. + Returns: + Line number of previous occurrence, or -1 if the header has not + been seen before. + """ + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 + + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. + + Args: + directive: preprocessor directive (e.g. "if", "else"). + """ # The name of the current section. self._section = self._INITIAL_SECTION # The path of last found header. self._last_header = '' + # Update list of includes. Note that we never pop from the + # include list. + if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] + def SetLastHeader(self, header_path): self._last_header = header_path @@ -615,7 +801,7 @@ class _IncludeState(dict): # If previous line was a blank line, assume that the headers are # intentionally sorted the way they are. if (self._last_header > header_path and - not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): return False return True @@ -681,8 +867,11 @@ class _CppLintState(object): self.error_count = 0 # global count of reported errors # filters to apply when emitting error messages self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] self.counting = 'total' # In what way are we counting errors? self.errors_by_category = {} # string to int dict storing error counts + self.quiet = False # Suppress non-error messagess? # output format: # "emacs" - format that emacs can parse (default) @@ -693,6 +882,12 @@ class _CppLintState(object): """Sets the output format for errors.""" self.output_format = output_format + def SetQuiet(self, quiet): + """Sets the module's quiet settings, and returns the previous setting.""" + last_quiet = self.quiet + self.quiet = quiet + return last_quiet + def SetVerboseLevel(self, level): """Sets the module's verbosity, and returns the previous setting.""" last_verbose_level = self.verbose_level @@ -719,6 +914,10 @@ class _CppLintState(object): """ # Default filters always have less priority than the flag ones. self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. """ for filt in filters.split(','): clean_filt = filt.strip() if clean_filt: @@ -728,6 +927,14 @@ class _CppLintState(object): raise ValueError('Every filter in --filters must start with + or -' ' (%s does not)' % filt) + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + def ResetErrorCounts(self): """Sets the module's error statistic back to zero.""" self.error_count = 0 @@ -748,7 +955,7 @@ class _CppLintState(object): for category, count in self.errors_by_category.iteritems(): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) - sys.stderr.write('Total errors found: %d\n' % self.error_count) + sys.stdout.write('Total errors found: %d\n' % self.error_count) _cpplint_state = _CppLintState() @@ -762,6 +969,14 @@ def _SetOutputFormat(output_format): """Sets the module's output format.""" _cpplint_state.SetOutputFormat(output_format) +def _Quiet(): + """Return's the module's quiet setting.""" + return _cpplint_state.quiet + +def _SetQuiet(quiet): + """Set the module's quiet status, and return previous setting.""" + return _cpplint_state.SetQuiet(quiet) + def _VerboseLevel(): """Returns the module's verbosity setting.""" @@ -795,6 +1010,25 @@ def _SetFilters(filters): """ _cpplint_state.SetFilters(filters) +def _AddFilters(filters): + """Adds more filter overrides. + + Unlike _SetFilters, this function does not reset the current list of filters + available. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.AddFilters(filters) + +def _BackupFilters(): + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + +def _RestoreFilters(): + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() class _FunctionState(object): """Tracks current function name and the number of lines in its body.""" @@ -830,6 +1064,9 @@ class _FunctionState(object): filename: The name of the current file. linenum: The number of the line to check. """ + if not self.in_a_function: + return + if Match(r'T(EST|est)', self.current_function): base_trigger = self._TEST_TRIGGER else: @@ -857,7 +1094,7 @@ class _IncludeError(Exception): pass -class FileInfo: +class FileInfo(object): """Provides utility functions for filenames. FileInfo provides easy access to the components of a file's path @@ -900,12 +1137,13 @@ class FileInfo: # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by # searching up from the current path. - root_dir = os.path.dirname(fullname) - while (root_dir != os.path.dirname(root_dir) and - not os.path.exists(os.path.join(root_dir, ".git")) and - not os.path.exists(os.path.join(root_dir, ".hg")) and - not os.path.exists(os.path.join(root_dir, ".svn"))): - root_dir = os.path.dirname(root_dir) + root_dir = current_dir = os.path.dirname(fullname) + while current_dir != os.path.dirname(current_dir): + if (os.path.exists(os.path.join(current_dir, ".git")) or + os.path.exists(os.path.join(current_dir, ".hg")) or + os.path.exists(os.path.join(current_dir, ".svn"))): + root_dir = current_dir + current_dir = os.path.dirname(current_dir) if (os.path.exists(os.path.join(root_dir, ".git")) or os.path.exists(os.path.join(root_dir, ".hg")) or @@ -944,7 +1182,7 @@ class FileInfo: def IsSource(self): """File has a source file extension.""" - return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + return _IsSourceExtension(self.Extension()[1:]) def _ShouldPrintError(category, confidence, linenum): @@ -955,6 +1193,7 @@ def _ShouldPrintError(category, confidence, linenum): # the verbosity level isn't high enough, or the filters filter it out. if IsErrorSuppressedByNolint(category, linenum): return False + if confidence < _cpplint_state.verbose_level: return False @@ -999,8 +1238,8 @@ def Error(filename, linenum, category, confidence, message): if _ShouldPrintError(category, confidence, linenum): _cpplint_state.IncrementErrorCount(category) if _cpplint_state.output_format == 'vs7': - sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( - filename, linenum, message, category, confidence)) + sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % ( + filename, linenum, category, message, confidence)) elif _cpplint_state.output_format == 'eclipse': sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( filename, linenum, message, category, confidence)) @@ -1012,11 +1251,9 @@ def Error(filename, linenum, category, confidence, message): # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') -# Matches strings. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') -# Matches characters. Escape codes should already be removed by ESCAPES. -_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") -# Matches multi-line C++ comments. +# Match a single C style comment on the same line. +_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' +# Matches multi-line C style comments. # This RE is a little bit more complicated than one might expect, because we # have to take care of space removals tools so we can handle comments inside # statements better. @@ -1025,10 +1262,10 @@ _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") # if this doesn't work we try on left side but only if there's a non-character # on the right. _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile( - r"""(\s*/\*.*\*/\s*$| - /\*.*\*/\s+| - \s+/\*.*\*/(?=\W)| - /\*.*\*/)""", re.VERBOSE) + r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' + + _RE_PATTERN_C_COMMENTS + r'\s+|' + + r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' + + _RE_PATTERN_C_COMMENTS + r')') def IsCppString(line): @@ -1083,13 +1320,26 @@ def CleanseRawStrings(raw_lines): delimiter = None else: # Haven't found the end yet, append a blank line. - line = '' + line = '""' - else: + # Look for beginning of a raw string, and replace them with + # empty strings. This is done in a loop to handle multiple raw + # strings on the same line. + while delimiter is None: # Look for beginning of a raw string. # See 2.14.15 [lex.string] for syntax. - matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) - if matched: + # + # Once we have matched a raw string, we check the prefix of the + # line to make sure that the line is not part of a single line + # comment. It's done this way because we remove raw strings + # before removing comments as opposed to removing comments + # before removing raw strings. This is because there are some + # cpplint checks that requires the comments to be preserved, but + # we don't want to check comments that are inside raw strings. + matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line) + if (matched and + not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//', + matched.group(1))): delimiter = ')' + matched.group(2) + '"' end = matched.group(3).find(delimiter) @@ -1101,6 +1351,8 @@ def CleanseRawStrings(raw_lines): else: # Start of a multi-line raw string line = matched.group(1) + '""' + else: + break lines_without_raw_strings.append(line) @@ -1131,10 +1383,10 @@ def FindNextMultiLineCommentEnd(lines, lineix): def RemoveMultiLineCommentsFromRange(lines, begin, end): """Clears a range of lines for multi-line comments.""" - # Having // dummy comments makes the lines non-empty, so we will not get + # Having // comments makes the lines non-empty, so we will not get # unnecessary blank line warnings later in the code. for i in range(begin, end): - lines[i] = '// dummy' + lines[i] = '/**/' def RemoveMultiLineComments(filename, lines, error): @@ -1170,12 +1422,14 @@ def CleanseComments(line): class CleansedLines(object): - """Holds 3 copies of all lines with different preprocessing applied to them. + """Holds 4 copies of all lines with different preprocessing applied to them. - 1) elided member contains lines without strings and comments, - 2) lines member contains lines without comments, and + 1) elided member contains lines without strings and comments. + 2) lines member contains lines without comments. 3) raw_lines member contains all the lines without processing. - All these three members are of , and of the same length. + 4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw + strings removed. + All these members are of , and of the same length. """ def __init__(self, lines): @@ -1206,38 +1460,138 @@ class CleansedLines(object): Returns: The line with collapsed strings. """ - if not _RE_PATTERN_INCLUDE.match(elided): - # Remove escaped characters first to make quote/single quote collapsing - # basic. Things that look like escaped characters shouldn't occur - # outside of strings and chars. - elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) - elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided) - elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided) - return elided + if _RE_PATTERN_INCLUDE.match(elided): + return elided + + # Remove escaped characters first to make quote/single quote collapsing + # basic. Things that look like escaped characters shouldn't occur + # outside of strings and chars. + elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided) + + # Replace quoted strings and digit separators. Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + + return collapsed -def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar): - """Find the position just after the matching endchar. +def FindEndOfExpressionInLine(line, startpos, stack): + """Find the position just after the end of current parenthesized expression. Args: line: a CleansedLines line. startpos: start searching at this position. - depth: nesting level at startpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at startpos. Returns: - On finding matching endchar: (index just after matching endchar, 0) - Otherwise: (-1, new depth at end of this line) + On finding matching end: (index just after matching end, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at end of this line) """ for i in xrange(startpos, len(line)): - if line[i] == startchar: - depth += 1 - elif line[i] == endchar: - depth -= 1 - if depth == 0: - return (i + 1, 0) - return (-1, depth) + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and + (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) def CloseExpression(clean_lines, linenum, pos): @@ -1246,6 +1600,11 @@ def CloseExpression(clean_lines, linenum, pos): If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the linenum/pos that correspond to the closing of the expression. + TODO(unknown): cpplint spends a fair bit of time matching parentheses. + Ideally we would want to index all opening and closing parentheses once + and have CloseExpression be just a simple lookup, but due to preprocessor + tricks, this is not so easy. + Args: clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. @@ -1259,35 +1618,28 @@ def CloseExpression(clean_lines, linenum, pos): """ line = clean_lines.elided[linenum] - startchar = line[pos] - if startchar not in '({[<': + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): return (line, clean_lines.NumLines(), -1) - if startchar == '(': endchar = ')' - if startchar == '[': endchar = ']' - if startchar == '{': endchar = '}' - if startchar == '<': endchar = '>' # Check first line - (end_pos, num_open) = FindEndOfExpressionInLine( - line, pos, 0, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) if end_pos > -1: return (line, linenum, end_pos) # Continue scanning forward - while linenum < clean_lines.NumLines() - 1: + while stack and linenum < clean_lines.NumLines() - 1: linenum += 1 line = clean_lines.elided[linenum] - (end_pos, num_open) = FindEndOfExpressionInLine( - line, 0, num_open, startchar, endchar) + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) if end_pos > -1: return (line, linenum, end_pos) - # Did not find endchar before end of file, give up + # Did not find end of expression before end of file, give up return (line, clean_lines.NumLines(), -1) -def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): - """Find position at the matching startchar. +def FindStartOfExpressionInLine(line, endpos, stack): + """Find position at the matching start of current expression. This is almost the reverse of FindEndOfExpressionInLine, but note that the input position and returned position differs by 1. @@ -1295,22 +1647,72 @@ def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): Args: line: a CleansedLines line. endpos: start searching at this position. - depth: nesting level at endpos. - startchar: expression opening character. - endchar: expression closing character. + stack: nesting stack at endpos. Returns: - On finding matching startchar: (index at matching startchar, 0) - Otherwise: (-1, new depth at beginning of this line) + On finding matching start: (index at matching start, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at beginning of this line) """ - for i in xrange(endpos, -1, -1): - if line[i] == endchar: - depth += 1 - elif line[i] == startchar: - depth -= 1 - if depth == 0: - return (i, 0) - return (-1, depth) + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or + Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + if ((char == '(' and stack[-1] == ')') or + (char == '[' and stack[-1] == ']') or + (char == '{' and stack[-1] == '}')): + stack.pop() + if not stack: + return (i, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == ';': + # Found something that look like end of statements. If we are currently + # expecting a '<', the matching '>' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '>': + stack.pop() + if not stack: + return (-1, None) + + i -= 1 + + return (-1, stack) def ReverseCloseExpression(clean_lines, linenum, pos): @@ -1331,30 +1733,23 @@ def ReverseCloseExpression(clean_lines, linenum, pos): return is the 'cleansed' line at linenum. """ line = clean_lines.elided[linenum] - endchar = line[pos] - if endchar not in ')}]>': + if line[pos] not in ')}]>': return (line, 0, -1) - if endchar == ')': startchar = '(' - if endchar == ']': startchar = '[' - if endchar == '}': startchar = '{' - if endchar == '>': startchar = '<' # Check last line - (start_pos, num_open) = FindStartOfExpressionInLine( - line, pos, 0, startchar, endchar) + (start_pos, stack) = FindStartOfExpressionInLine(line, pos, []) if start_pos > -1: return (line, linenum, start_pos) # Continue scanning backward - while linenum > 0: + while stack and linenum > 0: linenum -= 1 line = clean_lines.elided[linenum] - (start_pos, num_open) = FindStartOfExpressionInLine( - line, len(line) - 1, num_open, startchar, endchar) + (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack) if start_pos > -1: return (line, linenum, start_pos) - # Did not find startchar before beginning of file, give up + # Did not find start of expression before beginning of file, give up return (line, 0, -1) @@ -1362,7 +1757,7 @@ def CheckForCopyright(filename, lines, error): """Logs an error if no Copyright message appears at the top of the file.""" # We'll say it should occur by line 10. Don't forget there's a - # dummy line at the front. + # placeholder line at the front. for line in xrange(1, min(len(lines), 11)): if re.search(r'Copyright', lines[line], re.I): break else: # means no copyright line was found @@ -1371,6 +1766,46 @@ def CheckForCopyright(filename, lines, error): 'You should have a line: "Copyright [year] "') +def GetIndentLevel(line): + """Return the number of leading spaces in line. + + Args: + line: A string to check. + + Returns: + An integer count of leading spaces, possibly zero. + """ + indent = Match(r'^( *)\S', line) + if indent: + return len(indent.group(1)) + else: + return 0 + +def PathSplitToList(path): + """Returns the path split into a list by the separator. + + Args: + path: An absolute or relative path (e.g. '/a/b/c/' or '../a') + + Returns: + A list of path components (e.g. ['a', 'b', 'c]). + """ + lst = [] + while True: + (head, tail) = os.path.split(path) + if head == path: # absolute paths end + lst.append(head) + break + if tail == path: # relative paths end + lst.append(tail) + break + + path = head + lst.append(tail) + + lst.reverse() + return lst + def GetHeaderGuardCPPVariable(filename): """Returns the CPP variable that should be used as a header guard. @@ -1387,15 +1822,67 @@ def GetHeaderGuardCPPVariable(filename): # flymake. filename = re.sub(r'_flymake\.h$', '.h', filename) filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename) + # Replace 'c++' with 'cpp'. + filename = filename.replace('C++', 'cpp').replace('c++', 'cpp') fileinfo = FileInfo(filename) file_path_from_root = fileinfo.RepositoryName() - if _root: - file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) - return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' + + def FixupPathFromRoot(): + if _root_debug: + sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n" + %(_root, fileinfo.RepositoryName())) + + # Process the file path with the --root flag if it was set. + if not _root: + if _root_debug: + sys.stderr.write("_root unspecified\n") + return file_path_from_root + + def StripListPrefix(lst, prefix): + # f(['x', 'y'], ['w, z']) -> None (not a valid prefix) + if lst[:len(prefix)] != prefix: + return None + # f(['a, 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd'] + return lst[(len(prefix)):] + + # root behavior: + # --root=subdir , lstrips subdir from the header guard + maybe_path = StripListPrefix(PathSplitToList(file_path_from_root), + PathSplitToList(_root)) + + if _root_debug: + sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," + + " _root=%s)\n") %(maybe_path, file_path_from_root, _root)) + + if maybe_path: + return os.path.join(*maybe_path) + + # --root=.. , will prepend the outer directory to the header guard + full_path = fileinfo.FullName() + root_abspath = os.path.abspath(_root) + + maybe_path = StripListPrefix(PathSplitToList(full_path), + PathSplitToList(root_abspath)) + + if _root_debug: + sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " + + "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath)) + + if maybe_path: + return os.path.join(*maybe_path) + + if _root_debug: + sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root)) + + # --root=FAKE_DIR is ignored + return file_path_from_root + + file_path_from_root = FixupPathFromRoot() + return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' -def CheckForHeaderGuard(filename, lines, error): +def CheckForHeaderGuard(filename, clean_lines, error): """Checks that the file contains a header guard. Logs an error if no #ifndef header guard is present. For other @@ -1403,18 +1890,29 @@ def CheckForHeaderGuard(filename, lines, error): Args: filename: The name of the C++ header file. - lines: An array of strings, each representing a line of the file. + clean_lines: A CleansedLines instance containing the file. error: The function to call with any errors found. """ + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. + raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + cppvar = GetHeaderGuardCPPVariable(filename) - ifndef = None + ifndef = '' ifndef_linenum = 0 - define = None - endif = None + define = '' + endif = '' endif_linenum = 0 - for linenum, line in enumerate(lines): + for linenum, line in enumerate(raw_lines): linesplit = line.split() if len(linesplit) >= 2: # find the first occurrence of #ifndef and #define, save arg @@ -1429,18 +1927,12 @@ def CheckForHeaderGuard(filename, lines, error): endif = line endif_linenum = linenum - if not ifndef: + if not ifndef or not define or ifndef != define: error(filename, 0, 'build/header_guard', 5, 'No #ifndef header guard found, suggested CPP variable is: %s' % cppvar) return - if not define: - error(filename, 0, 'build/header_guard', 5, - 'No #define header guard found, suggested CPP variable is: %s' % - cppvar) - return - # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ # for backward compatibility. if ifndef != cppvar: @@ -1448,26 +1940,69 @@ def CheckForHeaderGuard(filename, lines, error): if ifndef != cppvar + '_': error_level = 5 - ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, error) error(filename, ifndef_linenum, 'build/header_guard', error_level, '#ifndef header guard has wrong style, please use: %s' % cppvar) - if define != ifndef: - error(filename, 0, 'build/header_guard', 5, - '#ifndef and #define don\'t match, suggested CPP variable is: %s' % - cppvar) + # Check for "//" comments on endif line. + ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, + error) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) + if match: + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) return - if endif != ('#endif // %s' % cppvar): - error_level = 0 - if endif != ('#endif // %s' % (cppvar + '_')): - error_level = 5 + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. + no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): + no_single_line_comments = False + break - ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, - error) - error(filename, endif_linenum, 'build/header_guard', error_level, - '#endif line should be "#endif // %s"' % cppvar) + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckHeaderFileIncluded(filename, include_state, error): + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + fileinfo = FileInfo(filename) + if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()): + return + + headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h' + if not os.path.exists(headerfile): + return + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] + + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) def CheckForBadCharacters(filename, lines, error): @@ -1551,19 +2086,33 @@ def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): 'Use C++11 raw strings or concatenation instead.') -threading_list = ( - ('asctime(', 'asctime_r('), - ('ctime(', 'ctime_r('), - ('getgrgid(', 'getgrgid_r('), - ('getgrnam(', 'getgrnam_r('), - ('getlogin(', 'getlogin_r('), - ('getpwnam(', 'getpwnam_r('), - ('getpwuid(', 'getpwuid_r('), - ('gmtime(', 'gmtime_r('), - ('localtime(', 'localtime_r('), - ('rand(', 'rand_r('), - ('strtok(', 'strtok_r('), - ('ttyname(', 'ttyname_r('), +# (non-threadsafe name, thread-safe alternative, validation pattern) +# +# The validation pattern is used to eliminate false positives such as: +# _rand(); // false positive due to substring match. +# ->rand(); // some member function rand(). +# ACMRandom rand(seed); // some variable named rand. +# ISAACRandom rand(); // another variable named rand. +# +# Basically we require the return value of these functions to be used +# in some expression context on the same line by matching on some +# operator before the function name. This eliminates constructors and +# member function calls. +_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)' +_THREADING_LIST = ( + ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'), + ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'), + ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'), + ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'), + ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'), + ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'), + ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'), + ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'), + ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'), + ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'), + ('strtok(', 'strtok_r(', + _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'), + ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'), ) @@ -1583,14 +2132,13 @@ def CheckPosixThreading(filename, clean_lines, linenum, error): error: The function to call with any errors found. """ line = clean_lines.elided[linenum] - for single_thread_function, multithread_safe_function in threading_list: - ix = line.find(single_thread_function) - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and - line[ix - 1] not in ('_', '.', '>'))): + for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST: + # Additional pattern matching check to confirm that this is the + # function we are looking for + if Search(pattern, line): error(filename, linenum, 'runtime/threadsafe_fn', 2, - 'Consider using ' + multithread_safe_function + - '...) instead of ' + single_thread_function + + 'Consider using ' + multithread_safe_func + + '...) instead of ' + single_thread_func + '...) for improved thread safety.') @@ -1612,7 +2160,6 @@ def CheckVlogArguments(filename, clean_lines, linenum, error): 'VLOG() should be used with numeric verbosity level. ' 'Use LOG() if you want symbolic severity levels.') - # Matches invalid increment: *count++, which moves pointer instead of # incrementing a value. _RE_PATTERN_INVALID_INCREMENT = re.compile( @@ -1641,13 +2188,29 @@ def CheckInvalidIncrement(filename, clean_lines, linenum, error): 'Changing pointer instead of value (or unused value of operator*).') +def IsMacroDefinition(clean_lines, linenum): + if Search(r'^#define', clean_lines[linenum]): + return True + + if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]): + return True + + return False + + +def IsForwardClassDeclaration(clean_lines, linenum): + return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum]) + + class _BlockInfo(object): """Stores information about a generic block of code.""" - def __init__(self, seen_open_brace): + def __init__(self, linenum, seen_open_brace): + self.starting_linenum = linenum self.seen_open_brace = seen_open_brace self.open_parentheses = 0 self.inline_asm = _NO_ASM + self.check_namespace_indentation = False def CheckBegin(self, filename, clean_lines, linenum, error): """Run checks that applies to text up to the opening brace. @@ -1677,15 +2240,33 @@ class _BlockInfo(object): """ pass + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. + + This is convenient for verifying that an object is an instance of + a _BlockInfo, but not an instance of any of the derived classes. + + Returns: + True for this class, False for derived classes. + """ + return self.__class__ == _BlockInfo + + +class _ExternCInfo(_BlockInfo): + """Stores information about an 'extern "C"' block.""" + + def __init__(self, linenum): + _BlockInfo.__init__(self, linenum, True) + class _ClassInfo(_BlockInfo): """Stores information about a class.""" def __init__(self, name, class_or_struct, clean_lines, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name - self.starting_linenum = linenum self.is_derived = False + self.check_namespace_indentation = True if class_or_struct == 'struct': self.access = 'public' self.is_struct = True @@ -1695,11 +2276,7 @@ class _ClassInfo(_BlockInfo): # Remember initial indentation level for this class. Using raw_lines here # instead of elided to account for leading comments. - initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum]) - if initial_indent: - self.class_indent = len(initial_indent.group(1)) - else: - self.class_indent = 0 + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) # Try to find the end of the class. This will be confused by things like: # class A { @@ -1721,6 +2298,23 @@ class _ClassInfo(_BlockInfo): self.is_derived = True def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', + clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + # Check that closing brace is aligned with beginning of the class. # Only do this if the closing brace is indented by only whitespaces. # This means we will not check single-line class definitions. @@ -1738,9 +2332,9 @@ class _NamespaceInfo(_BlockInfo): """Stores information about a namespace.""" def __init__(self, name, linenum): - _BlockInfo.__init__(self, False) + _BlockInfo.__init__(self, linenum, False) self.name = name or '' - self.starting_linenum = linenum + self.check_namespace_indentation = True def CheckEnd(self, filename, clean_lines, linenum, error): """Check end of namespace comments.""" @@ -1758,7 +2352,7 @@ class _NamespaceInfo(_BlockInfo): # deciding what these nontrivial things are, so this check is # triggered by namespace size only, which works most of the time. if (linenum - self.starting_linenum < 10 - and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)): return # Look for matching comment at end of namespace. @@ -1775,17 +2369,24 @@ class _NamespaceInfo(_BlockInfo): # expected namespace. if self.name: # Named namespace - if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + - r'[\*/\.\\\s]*$'), + if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' + + re.escape(self.name) + r'[\*/\.\\\s]*$'), line): error(filename, linenum, 'readability/namespace', 5, 'Namespace should be terminated with "// namespace %s"' % self.name) else: # Anonymous namespace - if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): - error(filename, linenum, 'readability/namespace', 5, - 'Namespace should be terminated with "// namespace"') + if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line): + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"') class _PreprocessorInfo(object): @@ -1802,7 +2403,7 @@ class _PreprocessorInfo(object): self.seen_else = False -class _NestingState(object): +class NestingState(object): """Holds states related to parsing braces.""" def __init__(self): @@ -1814,6 +2415,17 @@ class _NestingState(object): # - _BlockInfo: some other type of block. self.stack = [] + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] + # Stack of _PreprocessorInfo objects. self.pp_stack = [] @@ -1834,6 +2446,82 @@ class _NestingState(object): """ return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. + + Returns: + True if top of the stack is an extern block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ExternCInfo) + + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. + + Returns: + True if top of the stack is a class/struct, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ClassInfo) + + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. + + Returns: + True if the top of the stack is a block containing inline ASM. + """ + return self.stack and self.stack[-1].inline_asm != _NO_ASM + + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: position just after the suspected template argument. + Returns: + True if (linenum, pos) is inside template arguments. + """ + while linenum < clean_lines.NumLines(): + # Find the earliest character that might indicate a template argument + line = clean_lines.elided[linenum] + match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) + if not match: + linenum += 1 + pos = 0 + continue + token = match.group(1) + pos += len(match.group(0)) + + # These things do not look like template argument list: + # class Suspect { + # class Suspect x; } + if token in ('{', '}', ';'): return False + + # These things look like template argument list: + # template + # template + # template + # template + if token in ('>', '=', '[', ']', '.'): return True + + # Check if token is an unmatched '<'. + # If not, move on to the next character. + if token != '<': + pos += 1 + if pos >= len(line): + linenum += 1 + pos = 0 + continue + + # We can't be sure if we just find a single '<', and need to + # find the matching '>'. + (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) + if end_pos < 0: + # Not sure if template argument list or syntax error in file + return False + linenum = end_line + pos = end_pos + return False + def UpdatePreprocessor(self, line): """Update preprocessor stack. @@ -1890,6 +2578,7 @@ class _NestingState(object): # TODO(unknown): unexpected #endif, issue warning? pass + # TODO(unknown): Update() is too long, but we will refactor later. def Update(self, filename, clean_lines, linenum, error): """Update nesting state with current line. @@ -1901,7 +2590,17 @@ class _NestingState(object): """ line = clean_lines.elided[linenum] - # Update pp_stack first + # Remember top of the previous nesting stack. + # + # The stack is always pushed/popped and not modified in place, so + # we can just do a shallow copy instead of copy.deepcopy. Using + # deepcopy would slow down cpplint by ~28%. + if self.stack: + self.previous_stack_top = self.stack[-1] + else: + self.previous_stack_top = None + + # Update pp_stack self.UpdatePreprocessor(line) # Count parentheses. This is to avoid adding struct arguments to @@ -1952,32 +2651,27 @@ class _NestingState(object): # such as in: # class LOCKABLE API Object { # }; - # - # Templates with class arguments may confuse the parser, for example: - # template , - # class Vector = vector > - # class HeapQueue { - # - # Because this parser has no nesting state about templates, by the - # time it saw "class Comparator", it may think that it's a new class. - # Nested templates have a similar problem: - # template < - # typename ExportedType, - # typename TupleType, - # template class ImplTemplate> - # - # To avoid these cases, we ignore classes that are followed by '=' or '>' class_decl_match = Match( - r'\s*(template\s*<[\w\s<>,:]*>\s*)?' - r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)' - r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line) + r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(.*)$', line) if (class_decl_match and (not self.stack or self.stack[-1].open_parentheses == 0)): - self.stack.append(_ClassInfo( - class_decl_match.group(4), class_decl_match.group(2), - clean_lines, linenum)) - line = class_decl_match.group(5) + # We do not want to accept classes that are actually template arguments: + # template , + # template class Ignore3> + # void Function() {}; + # + # To avoid template argument cases, we scan forward and look for + # an unmatched '>'. If we see one, assume we are inside a + # template argument list. + end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): + self.stack.append(_ClassInfo( + class_decl_match.group(3), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(4) # If we have not yet seen the opening brace for the innermost block, # run checks here. @@ -2024,10 +2718,13 @@ class _NestingState(object): # stack otherwise. if not self.SeenOpenBrace(): self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo(linenum)) else: - self.stack.append(_BlockInfo(True)) + self.stack.append(_BlockInfo(linenum, True)) if _MATCH_ASM.match(line): self.stack[-1].inline_asm = _BLOCK_ASM + elif token == ';' or token == ')': # If we haven't seen an opening brace yet, but we already saw # a semicolon, this is probably a forward declaration. Pop @@ -2103,7 +2800,7 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -2136,7 +2833,8 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, r'\s+(register|static|extern|typedef)\b', line): error(filename, linenum, 'build/storage_class', 5, - 'Storage class (static, extern, typedef, etc) should be first.') + 'Storage-class specifier (static, extern, typedef, etc) should be ' + 'at the beginning of the declaration.') if Match(r'\s*#\s*endif\s*[^/\s]+', line): error(filename, linenum, 'build/endif_comment', 5, @@ -2176,26 +2874,79 @@ def CheckForNonStandardConstructs(filename, clean_lines, linenum, # Look for single-argument constructors that aren't marked explicit. # Technically a valid construct, but against style. - args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' - % re.escape(base_classname), - line) - if (args and - args.group(1) != 'void' and - not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' - % re.escape(base_classname), args.group(1).strip())): - error(filename, linenum, 'runtime/explicit', 5, - 'Single-argument constructors should be marked explicit.') + explicit_constructor_match = Match( + r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?' + r'(?:(?:inline|constexpr)\s+)*%s\s*' + r'\(((?:[^()]|\([^()]*\))*)\)' + % re.escape(base_classname), + line) + + if explicit_constructor_match: + is_marked_explicit = explicit_constructor_match.group(1) + + if not explicit_constructor_match.group(2): + constructor_args = [] + else: + constructor_args = explicit_constructor_match.group(2).split(',') + + # collapse arguments so that commas in template parameter lists and function + # argument parameter lists don't split arguments in two + i = 0 + while i < len(constructor_args): + constructor_arg = constructor_args[i] + while (constructor_arg.count('<') > constructor_arg.count('>') or + constructor_arg.count('(') > constructor_arg.count(')')): + constructor_arg += ',' + constructor_args[i + 1] + del constructor_args[i + 1] + constructor_args[i] = constructor_arg + i += 1 + + defaulted_args = [arg for arg in constructor_args if '=' in arg] + noarg_constructor = (not constructor_args or # empty arg list + # 'void' arg specifier + (len(constructor_args) == 1 and + constructor_args[0].strip() == 'void')) + onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg + not noarg_constructor) or + # all but at most one arg defaulted + (len(constructor_args) >= 1 and + not noarg_constructor and + len(defaulted_args) >= len(constructor_args) - 1)) + initializer_list_constructor = bool( + onearg_constructor and + Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) + copy_constructor = bool( + onearg_constructor and + Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), constructor_args[0].strip())) + + if (not is_marked_explicit and + onearg_constructor and + not initializer_list_constructor and + not copy_constructor): + if defaulted_args: + error(filename, linenum, 'runtime/explicit', 5, + 'Constructors callable with one argument ' + 'should be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 5, + 'Single-parameter constructors should be marked explicit.') + elif is_marked_explicit and not onearg_constructor: + if noarg_constructor: + error(filename, linenum, 'runtime/explicit', 5, + 'Zero-parameter constructors should not be marked explicit.') -def CheckSpacingForFunctionCall(filename, line, linenum, error): +def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): """Checks for the correctness of various spacing around function calls. Args: filename: The name of the current file. - line: The text of the line to check. + clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. error: The function to call with any errors found. """ + line = clean_lines.elided[linenum] # Since function calls often occur inside if/for/while/switch # expressions - which have their own, more liberal conventions - we @@ -2238,10 +2989,18 @@ def CheckSpacingForFunctionCall(filename, line, linenum, error): error(filename, linenum, 'whitespace/parens', 2, 'Extra space after (') if (Search(r'\w\s+\(', fncall) and - not Search(r'#\s*define|typedef', fncall) and - not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): - error(filename, linenum, 'whitespace/parens', 4, - 'Extra space before ( in function call') + not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and + not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\bcase\s+\(', fncall)): + # TODO(unknown): Space after an operator function seem to be a common + # error, silence those for now by restricting them to highest verbosity. + if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') # If the ) is followed only by a newline or a { + newline, assume it's # part of a control statement (if/while/etc), and don't complain if Search(r'[^)]\s+\)\s*[^{\s]', fncall): @@ -2270,12 +3029,26 @@ def IsBlankLine(line): return not line or line.isspace() +def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error): + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) + + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, + line, error) + + def CheckForFunctionLengths(filename, clean_lines, linenum, function_state, error): """Reports for long function bodies. For an overview why this is done, see: - http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions Uses a simplistic algorithm assuming other style guidelines (especially spacing) are followed. @@ -2295,8 +3068,6 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, """ lines = clean_lines.lines line = lines[linenum] - raw = clean_lines.raw_lines - raw_line = raw[linenum] joined_line = '' starting_func = False @@ -2343,190 +3114,58 @@ def CheckForFunctionLengths(filename, clean_lines, linenum, _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') -def CheckComment(comment, filename, linenum, error): - """Checks for common mistakes in TODO comments. +def CheckComment(line, filename, linenum, next_line_start, error): + """Checks for common mistakes in comments. Args: - comment: The text of the comment from the line in question. + line: The line in question. filename: The name of the current file. linenum: The number of the line to check. + next_line_start: The first non-whitespace column of the next line. error: The function to call with any errors found. """ - match = _RE_PATTERN_TODO.match(comment) - if match: - # One whitespace is correct; zero whitespace is handled elsewhere. - leading_whitespace = match.group(1) - if len(leading_whitespace) > 1: - error(filename, linenum, 'whitespace/todo', 2, - 'Too many spaces before TODO') + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0: + # Allow one space for new scopes, two spaces otherwise: + if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') - username = match.group(2) - if not username: - error(filename, linenum, 'readability/todo', 2, - 'Missing username in TODO; it should look like ' - '"// TODO(my_username): Stuff."') + # Checks for common mistakes in TODO comments. + comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') - middle_whitespace = match.group(3) - # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison - if middle_whitespace != ' ' and middle_whitespace != '': - error(filename, linenum, 'whitespace/todo', 2, - 'TODO(my_username) should be followed by a space') + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') -def CheckAccess(filename, clean_lines, linenum, nesting_state, error): - """Checks for improper use of DISALLOW* macros. + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') - Args: - filename: The name of the current file. - clean_lines: A CleansedLines instance containing the file. - linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about - the current stack of nested blocks being parsed. - error: The function to call with any errors found. - """ - line = clean_lines.elided[linenum] # get rid of comments and strings - - matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' - r'DISALLOW_EVIL_CONSTRUCTORS|' - r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) - if not matched: - return - if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): - if nesting_state.stack[-1].access != 'private': - error(filename, linenum, 'readability/constructors', 3, - '%s must be in the private: section' % matched.group(1)) - - else: - # Found DISALLOW* macro outside a class declaration, or perhaps it - # was used inside a function when it should have been part of the - # class declaration. We could issue a warning here, but it - # probably resulted in a compiler error already. - pass - - -def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): - """Find the corresponding > to close a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_suffix: Remainder of the current line after the initial <. - - Returns: - True if a matching bracket exists. - """ - line = init_suffix - nesting_stack = ['<'] - while True: - # Find the next operator that can tell us whether < is used as an - # opening bracket or as a less-than operator. We only want to - # warn on the latter case. - # - # We could also check all other operators and terminate the search - # early, e.g. if we got something like this "a(),;\[\]]*([<>(),;\[\]])(.*)$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(1) - line = match.group(2) - - if nesting_stack[-1] == '<': - # Expecting closing angle bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator == '>': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma after a bracket, this is most likely a template - # argument. We have not seen a closing angle bracket yet, but - # it's probably a few lines later if we look for it, so just - # return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting closing parenthesis or closing bracket - if operator in ('<', '(', '['): - nesting_stack.append(operator) - elif operator in (')', ']'): - # We don't bother checking for matching () or []. If we got - # something like (] or [), it would have been a syntax error. - nesting_stack.pop() - - else: - # Scan the next line - linenum += 1 - if linenum >= len(clean_lines.elided): - break - line = clean_lines.elided[linenum] - - # Exhausted all remaining lines and still no matching angle bracket. - # Most likely the input was incomplete, otherwise we should have - # seen a semicolon and returned early. - return True - - -def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix): - """Find the corresponding < that started a template. - - Args: - clean_lines: A CleansedLines instance containing the file. - linenum: Current line number. - init_prefix: Part of the current line before the initial >. - - Returns: - True if a matching bracket exists. - """ - line = init_prefix - nesting_stack = ['>'] - while True: - # Find the previous operator - match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line) - if match: - # Found an operator, update nesting stack - operator = match.group(2) - line = match.group(1) - - if nesting_stack[-1] == '>': - # Expecting opening angle bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator == '<': - nesting_stack.pop() - if not nesting_stack: - # Found matching angle bracket - return True - elif operator == ',': - # Got a comma before a bracket, this is most likely a - # template argument. The opening angle bracket is probably - # there if we look for it, so just return early here. - return True - else: - # Got some other operator. - return False - - else: - # Expecting opening parenthesis or opening bracket - if operator in ('>', ')', ']'): - nesting_stack.append(operator) - elif operator in ('(', '['): - nesting_stack.pop() - - else: - # Scan the previous line - linenum -= 1 - if linenum < 0: - break - line = clean_lines.elided[linenum] - - # Exhausted all earlier lines and still no matching angle bracket. - return False + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): @@ -2542,7 +3181,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -2565,7 +3204,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # } # # A warning about missing end of namespace comments will be issued instead. - if IsBlankLine(line) and not nesting_state.InNamespaceBody(): + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and + not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): elided = clean_lines.elided prev_line = elided[linenum - 1] prevbrace = prev_line.rfind('{') @@ -2628,54 +3272,64 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/blank_line', 3, 'Do not leave a blank line after "%s:"' % matched.group(1)) - # Next, we complain if there's a comment too near the text - commentpos = line.find('//') - if commentpos != -1: - # Check if the // may be in quotes. If so, ignore it - # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison - if (line.count('"', 0, commentpos) - - line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes - # Allow one space for new scopes, two spaces otherwise: - if (not Match(r'^\s*{ //', line) and - ((commentpos >= 1 and - line[commentpos-1] not in string.whitespace) or - (commentpos >= 2 and - line[commentpos-2] not in string.whitespace))): - error(filename, linenum, 'whitespace/comments', 2, - 'At least two spaces is best between code and comments') - # There should always be a space between the // and the comment - commentend = commentpos + 2 - if commentend < len(line) and not line[commentend] == ' ': - # but some lines are exceptions -- e.g. if they're big - # comment delimiters like: - # //---------------------------------------------------------- - # or are an empty C++ style Doxygen comment, like: - # /// - # or C++ style Doxygen comments placed after the variable: - # ///< Header comment - # //!< Header comment - # or they begin with multiple slashes followed by a space: - # //////// Header comment - match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or - Search(r'^/$', line[commentend:]) or - Search(r'^!< ', line[commentend:]) or - Search(r'^/< ', line[commentend:]) or - Search(r'^/+ ', line[commentend:])) - if not match: - error(filename, linenum, 'whitespace/comments', 4, - 'Should have a space between // and comment') - CheckComment(line[commentpos:], filename, linenum, error) + # Next, check comments + next_line_start = 0 + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) - line = clean_lines.elided[linenum] # get rid of comments and strings + # get rid of comments and strings + line = clean_lines.elided[linenum] - # Don't try to do spacing checks for operator methods - line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'. + if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search(r'for *\(.*[^:]:[^: ]', line) or + Search(r'for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckOperatorSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around operators. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Don't try to do spacing checks for operator methods. Do this by + # replacing the troublesome characters with something else, + # preserving column position for all other characters. + # + # The replacement is done repeatedly to avoid false positives from + # operators that call operators. + while True: + match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) + if match: + line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) + else: + break # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". # Otherwise not. Note we only check for non-spaces on *both* sides; # sometimes people put non-spaces on one side when aligning ='s among # many lines (not that this is behavior that I approve of...) - if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + if ((Search(r'[\w.]=', line) or + Search(r'=[\w.]', line)) + and not Search(r'\b(if|while|for) ', line) + # Operators taken from [lex.operators] in C++11 standard. + and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) + and not Search(r'operator=', line)): error(filename, linenum, 'whitespace/operators', 4, 'Missing spaces around =') @@ -2687,42 +3341,51 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # # Check <= and >= first to avoid false positives with < and >, then # check non-include lines for spacing around < and >. - match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line) + # + # If the operator is followed by a comma, assume it's be used in a + # macro context and don't do any checks. This avoids false + # positives. + # + # Note that && is not included here. This is because there are too + # many false positives due to RValue references. + match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) if match: error(filename, linenum, 'whitespace/operators', 3, 'Missing spaces around %s' % match.group(1)) - # We allow no-spaces around << when used like this: 10<<20, but - # not otherwise (particularly, not when used as streams) - # Also ignore using ns::operator<<; - match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line) - if (match and - not (match.group(1).isdigit() and match.group(2).isdigit()) and - not (match.group(1) == 'operator' and match.group(2) == ';')): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <<') elif not Match(r'#.*include', line): - # Avoid false positives on -> - reduced_line = line.replace('->', '') - # Look for < that is not surrounded by spaces. This is only # triggered if both sides are missing spaces, even though # technically should should flag if at least one side is missing a # space. This is done to avoid some false positives with shifts. - match = Search(r'[^\s<]<([^\s=<].*)', reduced_line) - if (match and - not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around <') + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') # Look for > that is not surrounded by spaces. Similar to the # above, we only trigger if both sides are missing spaces to avoid # false positives with shifts. - match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line) - if (match and - not FindPreviousMatchingAngleBracket(clean_lines, linenum, - match.group(1))): - error(filename, linenum, 'whitespace/operators', 3, - 'Missing spaces around >') + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression( + clean_lines, linenum, len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line) + if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') # We allow no-spaces around >> for almost anything. This is because # C++11 allows ">>" to close nested templates, which accounts for @@ -2747,7 +3410,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/operators', 4, 'Extra space for operator %s' % match.group(1)) - # A pet peeve of mine: no spaces after an if, while, switch, or for + +def CheckParenthesisSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around parentheses. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for match = Search(r' (if\(|for\(|while\(|switch\()', line) if match: error(filename, linenum, 'whitespace/parens', 5, @@ -2773,6 +3448,19 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): 'Should have zero or one spaces inside ( and ) in %s' % match.group(1)) + +def CheckCommaSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing near commas and semicolons. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + # You should always have a space after a comma (either as fn arg or operator) # # This does not apply when the non-space character following the @@ -2783,7 +3471,8 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # verify that lines contain missing whitespaces, second pass on raw # lines to confirm that those missing whitespaces are not due to # elided comments. - if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): error(filename, linenum, 'whitespace/comma', 3, 'Missing space after ,') @@ -2795,14 +3484,91 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/semicolon', 3, 'Missing space after ;') - # Next we will look for issues with function calls. - CheckSpacingForFunctionCall(filename, line, linenum, error) + +def _IsType(clean_lines, nesting_state, expr): + """Check if expression looks like a type name, returns true if so. + + Args: + clean_lines: A CleansedLines instance containing the file. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + expr: The expression to check. + Returns: + True, if token looks like a type. + """ + # Keep only the last token in the expression + last_word = Match(r'^.*(\b\S+)$', expr) + if last_word: + token = last_word.group(1) + else: + token = expr + + # Match native types and stdint types + if _TYPES.match(token): + return True + + # Try a bit harder to match templated types. Walk up the nesting + # stack until we find something that resembles a typename + # declaration for what we are looking for. + typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) + + r'\b') + block_index = len(nesting_state.stack) - 1 + while block_index >= 0: + if isinstance(nesting_state.stack[block_index], _NamespaceInfo): + return False + + # Found where the opening brace is. We want to scan from this + # line up to the beginning of the function, minus a few lines. + # template + # class C + # : public ... { // start scanning here + last_line = nesting_state.stack[block_index].starting_linenum + + next_block_start = 0 + if block_index > 0: + next_block_start = nesting_state.stack[block_index - 1].starting_linenum + first_line = last_line + while first_line >= next_block_start: + if clean_lines.elided[first_line].find('template') >= 0: + break + first_line -= 1 + if first_line < next_block_start: + # Didn't find any "template" keyword before reaching the next block, + # there are probably no template things to check for this block + block_index -= 1 + continue + + # Look for typename in the specified range + for i in xrange(first_line, last_line + 1, 1): + if Search(typename_pattern, clean_lines.elided[i]): + return True + block_index -= 1 + + return False + + +def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for horizontal spacing near commas. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # Except after an opening paren, or after another opening brace (in case of # an initializer list, for instance), you should have spaces before your - # braces. And since you should never have braces at the beginning of a line, - # this is an easy test. - match = Match(r'^(.*[^ ({]){', line) + # braces when they are delimiting blocks, classes, namespaces etc. + # And since you should never have braces at the beginning of a line, + # this is an easy test. Except that braces used for initialization don't + # follow the same rule; we often don't want spaces before those. + match = Match(r'^(.*[^ ({>]){', line) + if match: # Try a bit harder to check for brace initialization. This # happens in one of the following forms: @@ -2813,10 +3579,12 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # LastArgument(..., type{}); # LOG(INFO) << type{} << " ..."; # map_of_type[{...}] = ...; + # ternary = expr ? new type{} : nullptr; + # OuterTemplate{}> # # We check for the character following the closing brace, and # silence the warning if it's one of those listed above, i.e. - # "{.;,)<]". + # "{.;,)<>]:". # # To account for nested initializer list, we allow any number of # closing braces up to "{;,)<". We can't simply silence the @@ -2830,6 +3598,7 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): # There is a false negative with this approach if people inserted # spurious semicolons, e.g. "if (cond){};", but we will catch the # spurious semicolon with a separate check. + leading_text = match.group(1) (endline, endlinenum, endpos) = CloseExpression( clean_lines, linenum, len(match.group(1))) trailing_text = '' @@ -2838,7 +3607,11 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): for offset in xrange(endlinenum + 1, min(endlinenum + 3, clean_lines.NumLines() - 1)): trailing_text += clean_lines.elided[offset] - if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + # We also suppress warnings for `uint64_t{expression}` etc., as the style + # guide recommends brace initialization for integral types to avoid + # overflow/truncation. + if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text) + and not _IsType(clean_lines, nesting_state, leading_text)): error(filename, linenum, 'whitespace/braces', 5, 'Missing space before {') @@ -2847,12 +3620,6 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): error(filename, linenum, 'whitespace/braces', 5, 'Missing space before else') - # You shouldn't have spaces before your brackets, except maybe after - # 'delete []' or 'new char * []'. - if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): - error(filename, linenum, 'whitespace/braces', 5, - 'Extra space before [') - # You shouldn't have a space before a semicolon at the end of the line. # There's a special case for "for" since the style guide allows space before # the semicolon there. @@ -2869,12 +3636,23 @@ def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): 'Extra space before last semicolon. If this should be an empty ' 'statement, use {} instead.') - # In range-based for, we wanted spaces before and after the colon, but - # not around "::" tokens that might appear. - if (Search('for *\(.*[^:]:[^: ]', line) or - Search('for *\(.*[^: ]:[^:]', line)): - error(filename, linenum, 'whitespace/forcolon', 2, - 'Missing space around colon in range-based for loop') + +def IsDecltype(clean_lines, linenum, column): + """Check if the token ending on (linenum, column) is decltype(). + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is decltype() expression, False otherwise. + """ + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True + return False def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): @@ -2974,15 +3752,18 @@ def CheckBraces(filename, clean_lines, linenum, error): # used for brace initializers inside function calls. We don't detect this # perfectly: we just don't complain if the last non-whitespace character on # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the - # previous line starts a preprocessor block. + # previous line starts a preprocessor block. We also allow a brace on the + # following line if it is part of an array initialization and would not fit + # within the 80 character limit of the preceding line. prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if (not Search(r'[,;:}{(]\s*$', prevline) and - not Match(r'\s*#', prevline)): + not Match(r'\s*#', prevline) and + not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)): error(filename, linenum, 'whitespace/braces', 4, '{ should almost always be at the end of the previous line') # An else clause should be on the same line as the preceding closing brace. - if Match(r'\s*else\s*', line): + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] if Match(r'\s*}\s*$', prevline): error(filename, linenum, 'whitespace/newline', 4, @@ -2990,19 +3771,20 @@ def CheckBraces(filename, clean_lines, linenum, error): # If braces come on one side of an else, they should be on both. # However, we have to worry about "else if" that spans multiple lines! - if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): - if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if - # find the ( after the if - pos = line.find('else if') - pos = line.find('(', pos) - if pos > 0: - (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) - if endline[endpos:].find('{') == -1: # must be brace after if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') - else: # common case: else not followed by a multi-line if - error(filename, linenum, 'readability/braces', 5, - 'If an else has a brace on one side, it should have it on both') + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') # Likewise, an else should never have the else clause on the same line if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): @@ -3014,11 +3796,79 @@ def CheckBraces(filename, clean_lines, linenum, error): error(filename, linenum, 'whitespace/newline', 4, 'do/while clauses should not be on a single line') + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. + if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. + if (not Match(r'\s*{', endline[endpos:]) + and not (Match(r'\s*$', endline[endpos:]) + and endlinenum < (len(clean_lines.elided) - 1) + and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) + and ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. + if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) + and next_indent != if_indent): + error(filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + + +def CheckTrailingSemicolon(filename, clean_lines, linenum, error): + """Looks for redundant trailing semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] + # Block bodies should not be followed by a semicolon. Due to C++11 # brace initialization, there are more places where semicolons are - # required than not, so we use a whitelist approach to check these - # rather than a blacklist. These are the places where "};" should - # be replaced by just "}": + # required than not, so we explicitly list the allowed rules rather + # than listing the disallowed ones. These are the places where "};" + # should be replaced by just "}": # 1. Some flavor of block following closing parenthesis: # for (;;) {}; # while (...) {}; @@ -3074,28 +3924,40 @@ def CheckBraces(filename, clean_lines, linenum, error): # - INTERFACE_DEF # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: # - # We implement a whitelist of safe macros instead of a blacklist of + # We implement a list of safe macros instead of a list of # unsafe macros, even though the latter appears less frequently in # google code and would have been easier to implement. This is because - # the downside for getting the whitelist wrong means some extra - # semicolons, while the downside for getting the blacklist wrong + # the downside for getting the allowed checks wrong means some extra + # semicolons, while the downside for getting disallowed checks wrong # would result in compile errors. # - # In addition to macros, we also don't want to warn on compound - # literals. + # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs + # - decltype closing_brace_pos = match.group(1).rfind(')') opening_parenthesis = ReverseCloseExpression( clean_lines, linenum, closing_brace_pos) if opening_parenthesis[2] > -1: line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] - macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) if ((macro and macro.group(1) not in ( 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\bdecltype$', line_prefix) or Search(r'\s+=\s*$', line_prefix)): match = None + if (match and + opening_parenthesis[1] > 1 and + Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None else: # Try matching cases 2-3. @@ -3125,6 +3987,14 @@ def CheckBraces(filename, clean_lines, linenum, error): # outputting warnings for the matching closing brace, if there are # nested blocks with trailing semicolons, we will get the error # messages in reversed order. + + # We need to check the line forward for NOLINT + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1, + error) + ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum, + error) + error(filename, endlinenum, 'readability/braces', 4, "You don't need a ; after a }") @@ -3148,7 +4018,7 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): line = clean_lines.elided[linenum] matched = Match(r'\s*(for|while|if)\s*\(', line) if matched: - # Find the end of the conditional expression + # Find the end of the conditional expression. (end_line, end_linenum, end_pos) = CloseExpression( clean_lines, linenum, line.find('(')) @@ -3163,6 +4033,98 @@ def CheckEmptyBlockBody(filename, clean_lines, linenum, error): error(filename, end_linenum, 'whitespace/empty_loop_body', 5, 'Empty loop bodies should use {} or continue') + # Check for if statements that have completely empty bodies (no comments) + # and no else clauses. + if end_pos >= 0 and matched.group(1) == 'if': + # Find the position of the opening { for the if statement. + # Return without logging an error if it has no brackets. + opening_linenum = end_linenum + opening_line_fragment = end_line[end_pos:] + # Loop until EOF or find anything that's not whitespace or opening {. + while not Search(r'^\s*\{', opening_line_fragment): + if Search(r'^(?!\s*$)', opening_line_fragment): + # Conditional has no brackets. + return + opening_linenum += 1 + if opening_linenum == len(clean_lines.elided): + # Couldn't find conditional's opening { or any code before EOF. + return + opening_line_fragment = clean_lines.elided[opening_linenum] + # Set opening_line (opening_line_fragment may not be entire opening line). + opening_line = clean_lines.elided[opening_linenum] + + # Find the position of the closing }. + opening_pos = opening_line_fragment.find('{') + if opening_linenum == end_linenum: + # We need to make opening_pos relative to the start of the entire line. + opening_pos += end_pos + (closing_line, closing_linenum, closing_pos) = CloseExpression( + clean_lines, opening_linenum, opening_pos) + if closing_pos < 0: + return + + # Now construct the body of the conditional. This consists of the portion + # of the opening line after the {, all lines until the closing line, + # and the portion of the closing line before the }. + if (clean_lines.raw_lines[opening_linenum] != + CleanseComments(clean_lines.raw_lines[opening_linenum])): + # Opening line ends with a comment, so conditional isn't empty. + return + if closing_linenum > opening_linenum: + # Opening line after the {. Ignore comments here since we checked above. + body = list(opening_line[opening_pos+1:]) + # All lines until closing line, excluding closing line, with comments. + body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum]) + # Closing line before the }. Won't (and can't) have comments. + body.append(clean_lines.elided[closing_linenum][:closing_pos-1]) + body = '\n'.join(body) + else: + # If statement has brackets and fits on a single line. + body = opening_line[opening_pos+1:closing_pos-1] + + # Check if the body is empty + if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body): + return + # The body is empty. Now make sure there's not an else clause. + current_linenum = closing_linenum + current_line_fragment = closing_line[closing_pos:] + # Loop until EOF or find anything that's not whitespace or else clause. + while Search(r'^\s*$|^(?=\s*else)', current_line_fragment): + if Search(r'^(?=\s*else)', current_line_fragment): + # Found an else clause, so don't log an error. + return + current_linenum += 1 + if current_linenum == len(clean_lines.elided): + break + current_line_fragment = clean_lines.elided[current_linenum] + + # The body is empty and there's no else clause until EOF or other code. + error(filename, end_linenum, 'whitespace/empty_if_body', 4, + ('If statement had no body and no else clause')) + + +def FindCheckMacro(line): + """Find a replaceable CHECK-like macro. + + Args: + line: line to search on. + Returns: + (macro name, start position), or (None, -1) if no replaceable + macro is found. + """ + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) + def CheckCheck(filename, clean_lines, linenum, error): """Checks the use of CHECK and EXPECT macros. @@ -3176,24 +4138,8 @@ def CheckCheck(filename, clean_lines, linenum, error): # Decide the set of replacement macros that should be suggested lines = clean_lines.elided - check_macro = None - start_pos = -1 - for macro in _CHECK_MACROS: - i = lines[linenum].find(macro) - if i >= 0: - check_macro = macro - - # Find opening parenthesis. Do a regular expression match here - # to make sure that we are matching the expected CHECK macro, as - # opposed to some other macro that happens to contain the CHECK - # substring. - matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) - if not matched: - continue - start_pos = len(matched.group(1)) - break - if not check_macro or start_pos < 0: - # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: return # Find end of the boolean expression by matching parentheses @@ -3201,6 +4147,13 @@ def CheckCheck(filename, clean_lines, linenum, error): clean_lines, linenum, start_pos) if end_pos < 0: return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + if linenum == end_line: expression = lines[linenum][start_pos + 1:end_pos - 1] else: @@ -3223,7 +4176,7 @@ def CheckCheck(filename, clean_lines, linenum, error): if token == '(': # Parenthesized operand expression = matched.group(2) - (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) if end < 0: return # Unmatched parenthesis lhs += '(' + expression[0:end] @@ -3339,6 +4292,16 @@ def GetLineWidth(line): if unicodedata.east_asian_width(uc) in ('W', 'F'): width += 2 elif not unicodedata.combining(uc): + # Issue 337 + # https://mail.python.org/pipermail/python-list/2012-August/628809.html + if (sys.version_info.major, sys.version_info.minor) <= (3, 2): + # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81 + is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4 + # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564 + is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF + if not is_wide_build and is_low_surrogate: + width -= 1 + width += 1 return width else: @@ -3358,7 +4321,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3368,6 +4331,7 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # raw strings, raw_lines = clean_lines.lines_without_raw_strings line = raw_lines[linenum] + prev = raw_lines[linenum - 1] if linenum > 0 else '' if line.find('\t') != -1: error(filename, linenum, 'whitespace/tab', 1, @@ -3385,23 +4349,33 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # if(match($0, " <<")) complain = 0; # if(match(prev, " +for \\(")) complain = 0; # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() initial_spaces = 0 cleansed_line = clean_lines.elided[linenum] while initial_spaces < len(line) and line[initial_spaces] == ' ': initial_spaces += 1 - if line and line[-1].isspace(): - error(filename, linenum, 'whitespace/end_of_line', 4, - 'Line ends in whitespace. Consider deleting these extra spaces.') - # There are certain situations we allow one space, notably for section labels - elif ((initial_spaces == 1 or initial_spaces == 3) and - not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + # We also don't check for lines that look like continuation lines + # (of lines ending in double quotes, commas, equals, or angle brackets) + # because the rules for how to indent those are non-trivial. + if (not Search(r'[",=><] *$', prev) and + (initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): error(filename, linenum, 'whitespace/indent', 3, 'Weird number of spaces at line-start. ' 'Are you using a 2-space indent?') + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # Check if the line is a header guard. is_header_guard = False - if file_extension == 'h': + if IsHeaderExtension(file_extension): cppvar = GetHeaderGuardCPPVariable(filename) if (line.startswith('#ifndef %s' % cppvar) or line.startswith('#define %s' % cppvar) or @@ -3417,14 +4391,10 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # developers fault. if (not line.startswith('#include') and not is_header_guard and not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^\s*//\s*[^\s]*$', line) and not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): line_width = GetLineWidth(line) - extended_length = int((_line_length * 1.25)) - if line_width > extended_length: - error(filename, linenum, 'whitespace/line_length', 4, - 'Lines should very rarely be longer than %i characters' % - extended_length) - elif line_width > _line_length: + if line_width > _line_length: error(filename, linenum, 'whitespace/line_length', 2, 'Lines should be <= %i characters long' % _line_length) @@ -3442,9 +4412,14 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, # Some more style checks CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) CheckEmptyBlockBody(filename, clean_lines, linenum, error) - CheckAccess(filename, clean_lines, linenum, nesting_state, error) CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) CheckCheck(filename, clean_lines, linenum, error) CheckAltTokens(filename, clean_lines, linenum, error) classinfo = nesting_state.InnermostClass() @@ -3452,7 +4427,6 @@ def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) -_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') # Matches the first component of a filename delimited by -s and _s. That is: # _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' @@ -3489,23 +4463,6 @@ def _DropCommonSuffixes(filename): return os.path.splitext(filename)[0] -def _IsTestFilename(filename): - """Determines if the given filename has a suffix that identifies it as a test. - - Args: - filename: The input filename. - - Returns: - True if 'filename' looks like a test, False otherwise. - """ - if (filename.endswith('_test.cc') or - filename.endswith('_unittest.cc') or - filename.endswith('_regtest.cc')): - return True - else: - return False - - def _ClassifyInclude(fileinfo, include, is_system): """Figures out what kind of header 'include' is. @@ -3581,11 +4538,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): error: The function to call with any errors found. """ fileinfo = FileInfo(filename) - line = clean_lines.lines[linenum] # "include" should use the new style "foo/bar.h" instead of just "bar.h" - if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): error(filename, linenum, 'build/include', 4, 'Include the directory when naming .h files') @@ -3596,12 +4559,17 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): if match: include = match.group(2) is_system = (match.group(1) == '<') - if include in include_state: + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: error(filename, linenum, 'build/include', 4, '"%s" already included at %s:%s' % - (include, filename, include_state[include])) - else: - include_state[include] = linenum + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) # We want to ensure that headers appear in the right order: # 1) for foo.cc, foo.h (preferred location) @@ -3627,15 +4595,6 @@ def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): 'Include "%s" not in alphabetical order' % include) include_state.SetLastHeader(canonical_include) - # Look for any of the stream classes that are part of standard C++. - match = _RE_PATTERN_INCLUDE.match(line) - if match: - include = match.group(2) - if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include): - # Many unit tests use cout, so we exempt them. - if not _IsTestFilename(filename): - error(filename, linenum, 'readability/streams', 3, - 'Streams are highly discouraged.') def _GetTextInside(text, start_pattern): @@ -3658,7 +4617,7 @@ def _GetTextInside(text, start_pattern): The extracted text. None if either the opening string or ending punctuation could not be found. """ - # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably + # TODO(unknown): Audit cpplint.py to see what places could be profitably # rewritten to use _GetTextInside (and use inferior regexp matching today). # Give opening punctuations to get the matching close-punctuations. @@ -3718,6 +4677,9 @@ _RE_PATTERN_REF_PARAM = re.compile( _RE_PATTERN_CONST_REF_PARAM = ( r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') +# Stream types. +_RE_PATTERN_REF_STREAM_PARAM = ( + r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')') def CheckLanguage(filename, clean_lines, linenum, file_extension, @@ -3733,7 +4695,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, linenum: The number of the line to check. file_extension: The extension (without the dot) of the filename. include_state: An _IncludeState instance in which the headers are inserted. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -3750,129 +4712,23 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Reset include state across preprocessor directives. This is meant # to silence warnings for conditional includes. - if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): - include_state.ResetSection() + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) # Make Windows paths like Unix. fullname = os.path.abspath(filename).replace('\\', '/') - # TODO(unknown): figure out if they're using default arguments in fn proto. + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) - # Check to see if they're using an conversion function cast. - # I just try to capture the most common basic types, though there are more. - # Parameterless conversion functions, such as bool(), are allowed as they are - # probably a member operator declaration or default constructor. - match = Search( - r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there - r'(int|float|double|bool|char|int32|uint32|int64|uint64)' - r'(\([^)].*)', line) - if match: - matched_new = match.group(1) - matched_type = match.group(2) - matched_funcptr = match.group(3) - - # gMock methods are defined using some variant of MOCK_METHODx(name, type) - # where type may be float(), int(string), etc. Without context they are - # virtually indistinguishable from int(x) casts. Likewise, gMock's - # MockCallback takes a template parameter of the form return_type(arg_type), - # which looks much like the cast we're trying to detect. - # - # std::function<> wrapper has a similar problem. - # - # Return types for function pointers also look like casts if they - # don't have an extra space. - if (matched_new is None and # If new operator, then this isn't a cast - not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or - Search(r'\bMockCallback<.*>', line) or - Search(r'\bstd::function<.*>', line)) and - not (matched_funcptr and - Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', - matched_funcptr))): - # Try a bit harder to catch gmock lines: the only place where - # something looks like an old-style cast is where we declare the - # return type of the mocked method, and the only time when we - # are missing context is if MOCK_METHOD was split across - # multiple lines. The missing MOCK_METHOD is usually one or two - # lines back, so scan back one or two lines. - # - # It's not possible for gmock macros to appear in the first 2 - # lines, since the class head + section name takes up 2 lines. - if (linenum < 2 or - not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', - clean_lines.elided[linenum - 1]) or - Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', - clean_lines.elided[linenum - 2]))): - error(filename, linenum, 'readability/casting', 4, - 'Using deprecated casting style. ' - 'Use static_cast<%s>(...) instead' % - matched_type) - - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'static_cast', - r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) - - # This doesn't catch all cases. Consider (const char * const)"hello". - # - # (char *) "foo" should always be a const_cast (reinterpret_cast won't - # compile). - if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): - pass - else: - # Check pointer casts for other than string constants - CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], - 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) - - # In addition, we look for people taking the address of a cast. This - # is dangerous -- casts can assign to temporaries, so the pointer doesn't - # point where you think. - match = Search( - r'(?:&\(([^)]+)\)[\w(])|' - r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) - if match and match.group(1) != '*': - error(filename, linenum, 'runtime/casting', 4, - ('Are you taking an address of a cast? ' - 'This is dangerous: could be a temp var. ' - 'Take the address before doing the cast, rather than after')) - - # Create an extended_line, which is the concatenation of the current and - # next lines, for more effective checking of code that may span more than one - # line. - if linenum + 1 < clean_lines.NumLines(): - extended_line = line + clean_lines.elided[linenum + 1] - else: - extended_line = line - - # Check for people declaring static/global STL strings at the top level. - # This is dangerous because the C++ language does not guarantee that - # globals with constructors are initialized before the first access. - match = Match( - r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', - line) - # Make sure it's not a function. - # Function template specialization looks like: "string foo(...". - # Class template definitions look like: "string Foo::Method(...". - # - # Also ignore things that look like operators. These are matched separately - # because operator names cross non-word boundaries. If we change the pattern - # above, we would decrease the accuracy of matching identifiers. - if (match and - not Search(r'\boperator\W', line) and - not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): - error(filename, linenum, 'runtime/string', 4, - 'For a static/global string constant, use a C style string instead: ' - '"%schar %s[]".' % - (match.group(1), match.group(2))) - - if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): - error(filename, linenum, 'runtime/init', 4, - 'You seem to be initializing a member variable with itself.') - - if file_extension == 'h': + if IsHeaderExtension(file_extension): # TODO(unknown): check that 1-arg constructors are explicit. # How to tell it's a constructor? # (handled in CheckForNonStandardConstructs for now) - # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # TODO(unknown): check that classes declare or disable copy/assign # (level 1 error) pass @@ -3888,23 +4744,6 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, error(filename, linenum, 'runtime/int', 4, 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) - # When snprintf is used, the second argument shouldn't be a literal. - match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) - if match and match.group(2) != '0': - # If 2nd arg is zero, snprintf is used to calculate size. - error(filename, linenum, 'runtime/printf', 3, - 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' - 'to snprintf.' % (match.group(1), match.group(2))) - - # Check if some verboten C functions are being used. - if Search(r'\bsprintf\b', line): - error(filename, linenum, 'runtime/printf', 5, - 'Never use sprintf. Use snprintf instead.') - match = Search(r'\b(strcpy|strcat)\b', line) - if match: - error(filename, linenum, 'runtime/printf', 4, - 'Almost always, snprintf is better than %s' % match.group(1)) - # Check if some verboten operator overloading is going on # TODO(unknown): catch out-of-line unary operator&: # class X {}; @@ -3924,7 +4763,7 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, # Check for potential format string bugs like printf(foo). # We constrain the pattern not to pick things like DocidForPrintf(foo). # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) - # TODO(sugawarayu): Catch the following case. Need to change the calling + # TODO(unknown): Catch the following case. Need to change the calling # convention of the whole function to process multiple line to handle it. # printf( # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); @@ -3989,37 +4828,188 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, 'Do not use variable-length arrays. Use an appropriately named ' "('k' followed by CamelCase) compile-time constant for the size.") - # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or - # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing - # in the class declaration. - match = Match( - (r'\s*' - r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' - r'\(.*\);$'), - line) - if match and linenum + 1 < clean_lines.NumLines(): - next_line = clean_lines.elided[linenum + 1] - # We allow some, but not all, declarations of variables to be present - # in the statement that defines the class. The [\w\*,\s]* fragment of - # the regular expression below allows users to declare instances of - # the class or pointers to instances, but not less common types such - # as function pointers or arrays. It's a tradeoff between allowing - # reasonable code and avoiding trying to parse more C++ using regexps. - if not Search(r'^\s*}[\w\*,\s]*;', next_line): - error(filename, linenum, 'readability/constructors', 3, - match.group(1) + ' should be the last thing in the class') - # Check for use of unnamed namespaces in header files. Registration # macros are typically OK, so we allow use of "namespace {" on lines # that end with backslashes. - if (file_extension == 'h' + if (IsHeaderExtension(file_extension) and Search(r'\bnamespace\s*{', line) and line[-1] != '\\'): error(filename, linenum, 'build/namespaces', 4, 'Do not use unnamed namespaces in header files. See ' - 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + 'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' ' for more information.') + +def CheckGlobalStatic(filename, clean_lines, linenum, error): + """Check for unsafe global or static objects. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access, and + # also because globals can be destroyed when some threads are still running. + # TODO(unknown): Generalize this to also find static unique_ptr instances. + # TODO(unknown): File bugs for clang-tidy to find these. + match = Match( + r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +' + r'([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. + # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))): + if Search(r'\bconst\b', line): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string ' + 'instead: "%schar%s %s[]".' % + (match.group(1), match.group(2) or '', match.group(3))) + else: + error(filename, linenum, 'runtime/string', 4, + 'Static/global string variables are not permitted.') + + if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or + Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + +def CheckPrintf(filename, clean_lines, linenum, error): + """Check for printf related issues. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + +def IsDerivedFunction(clean_lines, linenum): + """Check if current line contains an inherited function. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains a function with "override" + virt-specifier. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression( + clean_lines, i, len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False + + +def IsOutOfLineMethodDefinition(clean_lines, linenum): + """Check if current line contains an out-of-line method definition. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains an out-of-line method definition. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None + return False + + +def IsInitializerList(clean_lines, linenum): + """Check if current line is inside constructor initializer list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line appears to be inside constructor initializer + list, False otherwise. + """ + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + def CheckForNonConstReference(filename, clean_lines, linenum, nesting_state, error): """Check for non-const references. @@ -4031,7 +5021,7 @@ def CheckForNonConstReference(filename, clean_lines, linenum, filename: The name of the current file. clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: The function to call with any errors found. """ @@ -4040,6 +5030,17 @@ def CheckForNonConstReference(filename, clean_lines, linenum, if '&' not in line: return + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + # Long type names may be broken across multiple lines, usually in one # of these forms: # LongType @@ -4088,60 +5089,192 @@ def CheckForNonConstReference(filename, clean_lines, linenum, # inside declarators: reference parameter # We will exclude the first two cases by checking that we are not inside a # function body, including one that was just introduced by a trailing '{'. - # TODO(unknwon): Doesn't account for preprocessor directives. # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. - check_params = False - if not nesting_state.stack: - check_params = True # top level - elif (isinstance(nesting_state.stack[-1], _ClassInfo) or - isinstance(nesting_state.stack[-1], _NamespaceInfo)): - check_params = True # within class or namespace - elif Match(r'.*{\s*$', line): - if (len(nesting_state.stack) == 1 or - isinstance(nesting_state.stack[-2], _ClassInfo) or - isinstance(nesting_state.stack[-2], _NamespaceInfo)): - check_params = True # just opened global/class/namespace block + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): + return + + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): + return + # We allow non-const references in a few standard places, like functions # called "swap()" or iostream operators like "<<" or ">>". Do not check # those function parameters. # # We also accept & in static_assert, which looks like a function but # it's actually a declaration expression. - whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|' r'operator\s*[<>][<>]|' r'static_assert|COMPILE_ASSERT' r')\s*\(') - if Search(whitelisted_functions, line): - check_params = False + if Search(allowed_functions, line): + return elif not Search(r'\S+\([^)]*$', line): - # Don't see a whitelisted function on this line. Actually we + # Don't see an allowed function on this line. Actually we # didn't see any function name on this line, so this is likely a # multi-line parameter list. Try a bit harder to catch this case. for i in xrange(2): if (linenum > i and - Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): - check_params = False - break + Search(allowed_functions, clean_lines.elided[linenum - i - 1])): + return - if check_params: - decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body - for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): - if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): - error(filename, linenum, 'runtime/references', 2, - 'Is this a non-const reference? ' - 'If so, make const or use a pointer: ' + - ReplaceAll(' *<', '<', parameter)) + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and + not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + + ReplaceAll(' *<', '<', parameter)) -def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, - error): +def CheckCasts(filename, clean_lines, linenum, error): + """Various cast related checks. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b' + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + expecting_function = ExpectingFunctionArgs(clean_lines, linenum) + if match and not expecting_function: + matched_type = match.group(2) + + # matched_new_or_template is used to silence two false positives: + # - New operators + # - Template arguments with function types + # + # For template arguments, we match on types immediately following + # an opening bracket without any spaces. This is a fast way to + # silence the common case where the function type is the first + # template argument. False negative with less-than comparison is + # avoided because those operators are usually followed by a space. + # + # function // bracket + no space = false positive + # value < double(42) // bracket + space = true positive + matched_new_or_template = match.group(1) + + # Avoid arrays by looking for brackets that come after the closing + # parenthesis. + if Match(r'\([^()]+\)\s*\[', match.group(3)): + return + + # Other things to ignore: + # - Function pointers + # - Casts to pointer types + # - Placement new + # - Alias declarations + matched_funcptr = match.group(3) + if (matched_new_or_template is None and + not (matched_funcptr and + (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr) or + matched_funcptr.startswith('(*)'))) and + not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and + not Search(r'new\(\S+\)\s*' + matched_type, line)): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + if not expecting_function: + CheckCStyleCast(filename, clean_lines, linenum, 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast', + r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast', + r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + # + # Some non-identifier character is required before the '&' for the + # expression to be recognized as a cast. These are casts: + # expression = &static_cast(temporary()); + # function(&(int*)(temporary())); + # + # This is not a cast: + # reference_type&(int* function_param); + match = Search( + r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' + r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) + if match: + # Try a better error message when the & is bound to something + # dereferenced by the casted pointer, as opposed to the casted + # pointer itself. + parenthesis_error = False + match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) + if match: + _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) + if x1 >= 0 and clean_lines.elided[y1][x1] == '(': + _, y2, x2 = CloseExpression(clean_lines, y1, x1) + if x2 >= 0: + extended_line = clean_lines.elided[y2][x2:] + if y2 < clean_lines.NumLines() - 1: + extended_line += clean_lines.elided[y2 + 1] + if Match(r'\s*(?:->|\[)', extended_line): + parenthesis_error = True + + if parenthesis_error: + error(filename, linenum, 'readability/casting', 4, + ('Are you taking an address of something dereferenced ' + 'from a cast? Wrapping the dereferenced expression in ' + 'parentheses will make the binding more obvious')) + else: + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + +def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): """Checks for a C-style cast by looking for the pattern. Args: filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. linenum: The number of the line to check. - line: The line of code to check. - raw_line: The raw line of code to check, with comments. cast_type: The string for the C++ cast to recommend. This is either reinterpret_cast, static_cast, or const_cast, depending. pattern: The regular expression used to find C-style casts. @@ -4151,75 +5284,34 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, True if an error was emitted. False otherwise. """ + line = clean_lines.elided[linenum] match = Search(pattern, line) if not match: return False - # e.g., sizeof(int) - sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) - if sizeof_match: - error(filename, linenum, 'runtime/sizeof', 1, - 'Using sizeof(type). Use sizeof(varname) instead if possible') - return True - - # operator++(int) and operator--(int) - if (line[0:match.start(1) - 1].endswith(' operator++') or - line[0:match.start(1) - 1].endswith(' operator--')): + # Exclude lines with keywords that tend to look like casts + context = line[0:match.start(1) - 1] + if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): return False - # A single unnamed argument for a function tends to look like old - # style cast. If we see those, don't issue warnings for deprecated - # casts, instead issue warnings for unnamed arguments where - # appropriate. - # - # These are things that we want warnings for, since the style guide - # explicitly require all parameters to be named: - # Function(int); - # Function(int) { - # ConstMember(int) const; - # ConstMember(int) const { - # ExceptionMember(int) throw (...); - # ExceptionMember(int) throw (...) { - # PureVirtual(int) = 0; - # - # These are functions of some sort, where the compiler would be fine - # if they had named parameters, but people often omit those - # identifiers to reduce clutter: - # (FunctionPointer)(int); - # (FunctionPointer)(int) = value; - # Function((function_pointer_arg)(int)) - # ; - # <(FunctionPointerTemplateArgument)(int)>; + # Try expanding current context to see if we one level of + # parentheses inside a macro. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 5), -1): + context = clean_lines.elided[i] + context + if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): + return False + + # operator++(int) and operator--(int) + if context.endswith(' operator++') or context.endswith(' operator--'): + return False + + # A single unnamed argument for a function tends to look like old style cast. + # If we see those, don't issue warnings for deprecated casts. remainder = line[match.end(0):] - if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): - # Looks like an unnamed parameter. - - # Don't warn on any kind of template arguments. - if Match(r'^\s*>', remainder): - return False - - # Don't warn on assignments to function pointers, but keep warnings for - # unnamed parameters to pure virtual functions. Note that this pattern - # will also pass on assignments of "0" to function pointers, but the - # preferred values for those would be "nullptr" or "NULL". - matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) - if matched_zero and matched_zero.group(1) != '0': - return False - - # Don't warn on function pointer declarations. For this we need - # to check what came before the "(type)" string. - if Match(r'.*\)\s*$', line[0:match.start(0)]): - return False - - # Don't warn if the parameter is named with block comments, e.g.: - # Function(int /*unused_param*/); - if '/*' in raw_line: - return False - - # Passed all filters, issue warning here. - error(filename, linenum, 'readability/function', 3, - 'All parameters should be named in a function') - return True + if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)', + remainder): + return False # At this point, all that should be left is actual casts. error(filename, linenum, 'readability/casting', 4, @@ -4229,6 +5321,28 @@ def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, return True +def ExpectingFunctionArgs(clean_lines, linenum): + """Checks whether where function type arguments are expected. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + + Returns: + True if the line at 'linenum' is inside something that expects arguments + of function types. + """ + line = clean_lines.elided[linenum] + return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + (linenum >= 2 and + (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]) or + Search(r'\bstd::m?function\s*\<\s*$', + clean_lines.elided[linenum - 1])))) + + _HEADERS_CONTAINING_TEMPLATES = ( ('', ('deque',)), ('', ('unary_function', 'binary_function', @@ -4251,11 +5365,15 @@ _HEADERS_CONTAINING_TEMPLATES = ( ('', ('numeric_limits',)), ('', ('list',)), ('', ('map', 'multimap',)), - ('', ('allocator',)), + ('', ('allocator', 'make_shared', 'make_unique', 'shared_ptr', + 'unique_ptr', 'weak_ptr')), ('', ('queue', 'priority_queue',)), ('', ('set', 'multiset',)), ('', ('stack',)), ('', ('char_traits', 'basic_string',)), + ('', ('tuple',)), + ('', ('unordered_map', 'unordered_multimap')), + ('', ('unordered_set', 'unordered_multiset')), ('', ('pair',)), ('', ('vector',)), @@ -4266,18 +5384,26 @@ _HEADERS_CONTAINING_TEMPLATES = ( ('', ('slist',)), ) +_HEADERS_MAYBE_TEMPLATES = ( + ('', ('copy', 'max', 'min', 'min_element', 'sort', + 'transform', + )), + ('', ('forward', 'make_pair', 'move', 'swap')), + ) + _RE_PATTERN_STRING = re.compile(r'\bstring\b') -_re_pattern_algorithm_header = [] -for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', - 'transform'): - # Match max(..., ...), max(..., ...), but not foo->max, foo.max or - # type::max(). - _re_pattern_algorithm_header.append( - (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), - _template, - '')) +_re_pattern_headers_maybe_templates = [] +for _header, _templates in _HEADERS_MAYBE_TEMPLATES: + for _template in _templates: + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_headers_maybe_templates.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), + _template, + _header)) +# Other scripts may reach in and modify this pattern. _re_pattern_templates = [] for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: for _template in _templates: @@ -4317,13 +5443,13 @@ def FilesBelongToSameModule(filename_cc, filename_h): string: the additional prefix needed to open the header file. """ - if not filename_cc.endswith('.cc'): + fileinfo = FileInfo(filename_cc) + if not fileinfo.IsSource(): return (False, '') - filename_cc = filename_cc[:-len('.cc')] - if filename_cc.endswith('_unittest'): - filename_cc = filename_cc[:-len('_unittest')] - elif filename_cc.endswith('_test'): - filename_cc = filename_cc[:-len('_test')] + filename_cc = filename_cc[:-len(fileinfo.Extension())] + matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()) + if matched_test_suffix: + filename_cc = filename_cc[:-len(matched_test_suffix.group(1))] filename_cc = filename_cc.replace('/public/', '/') filename_cc = filename_cc.replace('/internal/', '/') @@ -4342,16 +5468,16 @@ def FilesBelongToSameModule(filename_cc, filename_h): return files_belong_to_same_module, common_path -def UpdateIncludeState(filename, include_state, io=codecs): - """Fill up the include_state with new includes found from the file. +def UpdateIncludeState(filename, include_dict, io=codecs): + """Fill up the include_dict with new includes found from the file. Args: filename: the name of the header to read. - include_state: an _IncludeState instance in which the headers are inserted. + include_dict: a dictionary in which the headers are inserted. io: The io factory to use to read the file. Provided for testability. Returns: - True if a header was succesfully added. False otherwise. + True if a header was successfully added. False otherwise. """ headerfile = None try: @@ -4365,9 +5491,7 @@ def UpdateIncludeState(filename, include_state, io=codecs): match = _RE_PATTERN_INCLUDE.search(clean_line) if match: include = match.group(2) - # The value formatting is cute, but not really used right now. - # What matters here is that the key is in include_state. - include_state.setdefault(include, '%s:%d' % (filename, linenum)) + include_dict.setdefault(include, linenum) return True @@ -4406,7 +5530,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, if prefix.endswith('std::') or not prefix.endswith('::'): required[''] = (linenum, 'string') - for pattern, template, header in _re_pattern_algorithm_header: + for pattern, template, header in _re_pattern_headers_maybe_templates: if pattern.search(line): required[header] = (linenum, template) @@ -4415,15 +5539,21 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, continue for pattern, template, header in _re_pattern_templates: - if pattern.search(line): - required[header] = (linenum, template) + matched = pattern.search(line) + if matched: + # Don't warn about IWYU in non-STL namespaces: + # (We check only the first match per line; good enough.) + prefix = line[:matched.start()] + if prefix.endswith('std::') or not prefix.endswith('::'): + required[header] = (linenum, template) # The policy is that if you #include something in foo.h you don't need to # include it again in foo.cc. Here, we will look at possible includes. - # Let's copy the include_state so it is only messed up within this function. - include_state = include_state.copy() + # Let's flatten the include_state include_list and copy it into a dictionary. + include_dict = dict([item for sublist in include_state.include_list + for item in sublist]) - # Did we find the header for this file (if any) and succesfully load it? + # Did we find the header for this file (if any) and successfully load it? header_found = False # Use the absolute path so that matching works properly. @@ -4438,13 +5568,13 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # instead of 'foo_flymake.h' abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename) - # include_state is modified during iteration, so we iterate over a copy of + # include_dict is modified during iteration, so we iterate over a copy of # the keys. - header_keys = include_state.keys() + header_keys = include_dict.keys() for header in header_keys: (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) fullpath = common_path + header - if same_module and UpdateIncludeState(fullpath, include_state, io): + if same_module and UpdateIncludeState(fullpath, include_dict, io): header_found = True # If we can't find the header file for a .cc, assume it's because we don't @@ -4458,7 +5588,7 @@ def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error, # All the lines have been processed, report the errors found. for required_header_unstripped in required: template = required[required_header_unstripped][1] - if required_header_unstripped.strip('<>"') not in include_state: + if required_header_unstripped.strip('<>"') not in include_dict: error(filename, required[required_header_unstripped][0], 'build/include_what_you_use', 4, 'Add #include ' + required_header_unstripped + ' for ' + template) @@ -4470,7 +5600,7 @@ _RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): """Check that make_pair's template arguments are deduced. - G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are + G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are specified explicitly, and such use isn't intended in any case. Args: @@ -4488,6 +5618,165 @@ def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): ' OR use pair directly OR if appropriate, construct a pair directly') +def CheckRedundantVirtual(filename, clean_lines, linenum, error): + """Check if line contains a redundant "virtual" function-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for "virtual" on current line. + line = clean_lines.elided[linenum] + virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) + if not virtual: return + + # Ignore "virtual" keywords that are near access-specifiers. These + # are only used in class base-specifier and do not apply to member + # functions. + if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or + Match(r'^\s+(public|protected|private)\b', virtual.group(3))): + return + + # Ignore the "virtual" keyword from virtual base classes. Usually + # there is a column on the same line in these cases (virtual base + # classes are rare in google3 because multiple inheritance is rare). + if Match(r'^.*[^:]:[^:].*$', line): return + + # Look for the next opening parenthesis. This is the start of the + # parameter list (possibly on the next line shortly after virtual). + # TODO(unknown): doesn't work if there are virtual functions with + # decltype() or other things that use parentheses, but csearch suggests + # that this is rare. + end_col = -1 + end_line = -1 + start_col = len(virtual.group(2)) + for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): + line = clean_lines.elided[start_line][start_col:] + parameter_list = Match(r'^([^(]*)\(', line) + if parameter_list: + # Match parentheses to find the end of the parameter list + (_, end_line, end_col) = CloseExpression( + clean_lines, start_line, start_col + len(parameter_list.group(1))) + break + start_col = 0 + + if end_col < 0: + return # Couldn't find end of parameter list, give up + + # Look for "override" or "final" after the parameter list + # (possibly on the next few lines). + for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. + end_col = 0 + if Search(r'[^\w]\s*$', line): + break + + +def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): + """Check if line contains a redundant "override" or "final" virt-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] + else: + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) + + + + +# Returns true if we are at a new block, and it is directly +# inside of a namespace. +def IsBlockInNameSpace(nesting_state, is_forward_declaration): + """Checks that the new block is directly in a namespace. + + Args: + nesting_state: The _NestingState object that contains info about our state. + is_forward_declaration: If the class is a forward declared class. + Returns: + Whether or not the new block is directly in a namespace. + """ + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False + + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) + + +def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + raw_lines_no_comments, linenum): + """This method determines if we should apply our namespace indentation check. + + Args: + nesting_state: The current nesting state. + is_namespace_indent_item: If we just put a new class on the stack, True. + If the top of the stack is not a class, or we did not recently + add the class, False. + raw_lines_no_comments: The lines without the comments. + linenum: The current line number we are processing. + + Returns: + True if we should apply our namespace indentation check. Currently, it + only works for classes and namespaces inside of a namespace. + """ + + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) + + if not (is_namespace_indent_item or is_forward_declaration): + return False + + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False + + return IsBlockInNameSpace(nesting_state, is_forward_declaration) + + +# Call this method if the line is directly inside of a namespace. +# If the line above is blank (excluding comments) or the start of +# an inner namespace, it cannot be indented. +def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, + error): + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + def ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions=[]): @@ -4501,7 +5790,7 @@ def ProcessLine(filename, file_extension, clean_lines, line, line: Number of line being processed. include_state: An _IncludeState instance in which the headers are inserted. function_state: A _FunctionState instance which counts function lines, etc. - nesting_state: A _NestingState instance which maintains information about + nesting_state: A NestingState instance which maintains information about the current stack of nested blocks being parsed. error: A callable to which errors are reported, which takes 4 arguments: filename, line number, error level, and message @@ -4512,8 +5801,9 @@ def ProcessLine(filename, file_extension, clean_lines, line, raw_lines = clean_lines.raw_lines ParseNolintSuppressions(filename, raw_lines[line], line, error) nesting_state.Update(filename, clean_lines, line, error) - if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: - return + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return CheckForFunctionLengths(filename, clean_lines, line, function_state, error) CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) @@ -4526,9 +5816,82 @@ def ProcessLine(filename, file_extension, clean_lines, line, CheckPosixThreading(filename, clean_lines, line, error) CheckInvalidIncrement(filename, clean_lines, line, error) CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) for check_fn in extra_check_functions: check_fn(filename, clean_lines, line, error) +def FlagCxx11Features(filename, clean_lines, linenum, error): + """Flag those c++11 features that we only allow in certain places. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++ TR1 headers. + if include and include.group(1).startswith('tr1/'): + error(filename, linenum, 'build/c++tr1', 5, + ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1)) + + # Flag unapproved C++11 headers. + if include and include.group(1) in ('cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', + ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. + for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', + ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, + ('std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def FlagCxx14Features(filename, clean_lines, linenum, error): + """Flag those C++14 features that we restrict. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + + # Flag unapproved C++14 headers. + if include and include.group(1) in ('scoped_allocator', 'shared_mutex'): + error(filename, linenum, 'build/c++14', 5, + ('<%s> is an unapproved C++14 header.') % include.group(1)) + + def ProcessFileData(filename, file_extension, lines, error, extra_check_functions=[]): """Performs lint checks and reports any errors to the given error function. @@ -4549,31 +5912,122 @@ def ProcessFileData(filename, file_extension, lines, error, include_state = _IncludeState() function_state = _FunctionState() - nesting_state = _NestingState() + nesting_state = NestingState() ResetNolintSuppressions() CheckForCopyright(filename, lines, error) - - if file_extension == 'h': - CheckForHeaderGuard(filename, lines, error) - + ProcessGlobalSuppresions(lines) RemoveMultiLineComments(filename, lines, error) clean_lines = CleansedLines(lines) + + if IsHeaderExtension(file_extension): + CheckForHeaderGuard(filename, clean_lines, error) + for line in xrange(clean_lines.NumLines()): ProcessLine(filename, file_extension, clean_lines, line, include_state, function_state, nesting_state, error, extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) nesting_state.CheckCompletedBlocks(filename, error) CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + # Check that the .cc file has included its header if it exists. + if _IsSourceExtension(file_extension): + CheckHeaderFileIncluded(filename, include_state, error) + # We check here rather than inside ProcessLine so that we see raw # lines rather than "cleaned" lines. CheckForBadCharacters(filename, lines, error) CheckForNewlineAtEOF(filename, lines, error) +def ProcessConfigOverrides(filename): + """ Loads the configuration files and processes the config overrides. + + Args: + filename: The name of the file being processed by the linter. + + Returns: + False if the current |filename| should not be processed further. + """ + + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): + continue + + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. + if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + if _cpplint_state.quiet: + # Suppress "Ignoring file" warning when using --quiet. + return False + sys.stderr.write('Ignoring "%s": file excluded by "%s". ' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + elif name == 'root': + global _root + # root directories are specified relative to CPPLINT.cfg dir. + _root = os.path.join(os.path.dirname(cfg_file), val) + elif name == 'headers': + ProcessHppHeadersOption(val) + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: + sys.stderr.write( + "Skipping config file '%s': Can't open for reading\n" % cfg_file) + keep_looking = False + + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) + + return True + + def ProcessFile(filename, vlevel, extra_check_functions=[]): """Does google-lint on a single file. @@ -4589,7 +6043,15 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): """ _SetVerboseLevel(vlevel) + _BackupFilters() + old_errors = _cpplint_state.error_count + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + + lf_lines = [] + crlf_lines = [] try: # Support the UNIX convention of using "-" for stdin. Note that # we are not opening the file with universal newline support @@ -4597,10 +6059,7 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): # contain trailing '\r' characters if we are reading a file that # has CRLF endings. # If after the split a trailing '\r' is present, it is removed - # below. If it is not expected to be present (i.e. os.linesep != - # '\r\n' as in Windows), a warning is issued below if this file - # is processed. - + # below. if filename == '-': lines = codecs.StreamReaderWriter(sys.stdin, codecs.getreader('utf8'), @@ -4609,16 +6068,19 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') - carriage_return_found = False # Remove trailing '\r'. - for linenum in range(len(lines)): + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): if lines[linenum].endswith('\r'): lines[linenum] = lines[linenum].rstrip('\r') - carriage_return_found = True + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) except IOError: sys.stderr.write( "Skipping input '%s': Can't open for reading\n" % filename) + _RestoreFilters() return # Note, if no dot is found, this will give the entire filename as the ext. @@ -4632,14 +6094,30 @@ def ProcessFile(filename, vlevel, extra_check_functions=[]): else: ProcessFileData(filename, file_extension, lines, Error, extra_check_functions) - if carriage_return_found and os.linesep != '\r\n': - # Use 0 for linenum since outputting only one error for potentially - # several lines. - Error(filename, 0, 'whitespace/newline', 1, - 'One or more unexpected \\r (^M) found;' - 'better to use only a \\n') - sys.stderr.write('Done processing %s\n' % filename) + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + # Suppress printing anything if --quiet was passed unless the error + # count has increased after processing this file. + if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count: + sys.stdout.write('Done processing %s\n' % filename) + _RestoreFilters() def PrintUsage(message): @@ -4681,13 +6159,16 @@ def ParseArguments(args): 'filter=', 'root=', 'linelength=', - 'extensions=']) + 'extensions=', + 'headers=', + 'quiet']) except getopt.GetoptError: PrintUsage('Invalid arguments.') verbosity = _VerboseLevel() output_format = _OutputFormat() filters = '' + quiet = _Quiet() counting_style = '' for (opt, val) in opts: @@ -4697,6 +6178,8 @@ def ParseArguments(args): if val not in ('emacs', 'vs7', 'eclipse'): PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') output_format = val + elif opt == '--quiet': + quiet = True elif opt == '--verbose': verbosity = int(val) elif opt == '--filter': @@ -4721,12 +6204,15 @@ def ParseArguments(args): try: _valid_extensions = set(val.split(',')) except ValueError: - PrintUsage('Extensions must be comma seperated list.') + PrintUsage('Extensions must be comma separated list.') + elif opt == '--headers': + ProcessHppHeadersOption(val) if not filenames: PrintUsage('No files were specified.') _SetOutputFormat(output_format) + _SetQuiet(quiet) _SetVerboseLevel(verbosity) _SetFilters(filters) _SetCountingStyle(counting_style) @@ -4747,7 +6233,9 @@ def main(): _cpplint_state.ResetErrorCounts() for filename in filenames: ProcessFile(filename, _cpplint_state.verbose_level) - _cpplint_state.PrintErrorCounts() + # If --quiet is passed, suppress printing error count unless there are errors. + if not _cpplint_state.quiet or _cpplint_state.error_count > 0: + _cpplint_state.PrintErrorCounts() sys.exit(_cpplint_state.error_count > 0) diff --git a/media/libvpx/libvpx/tools/diff.py b/media/libvpx/libvpx/tools/diff.py index a96c7db851..860a6b051b 100644 --- a/media/libvpx/libvpx/tools/diff.py +++ b/media/libvpx/libvpx/tools/diff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license diff --git a/media/libvpx/libvpx/tools/ftfy.sh b/media/libvpx/libvpx/tools/ftfy.sh deleted file mode 100644 index c005918fe7..0000000000 --- a/media/libvpx/libvpx/tools/ftfy.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/sh -self="$0" -dirname_self=$(dirname "$self") - -usage() { - cat <&2 -Usage: $self [option] - -This script applies a whitespace transformation to the commit at HEAD. If no -options are given, then the modified files are left in the working tree. - -Options: - -h, --help Shows this message - -n, --dry-run Shows a diff of the changes to be made. - --amend Squashes the changes into the commit at HEAD - This option will also reformat the commit message. - --commit Creates a new commit containing only the whitespace changes - --msg-only Reformat the commit message only, ignore the patch itself. - -EOF - rm -f ${CLEAN_FILES} - exit 1 -} - - -log() { - echo "${self##*/}: $@" >&2 -} - - -vpx_style() { - for f; do - case "$f" in - *.h|*.c|*.cc) - clang-format -i --style=file "$f" - ;; - esac - done -} - - -apply() { - [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1" -} - - -commit() { - LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}') - if [ -z "$LAST_CHANGEID" ]; then - log "HEAD doesn't have a Change-Id, unable to generate a new commit" - exit 1 - fi - - # Build a deterministic Change-Id from the parent's - NEW_CHANGEID=${LAST_CHANGEID}-styled - NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin) - - # Commit, preserving authorship from the parent commit. - git commit -a -C HEAD > /dev/null - git commit --amend -F- << EOF -Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9} - -Change-Id: ${NEW_CHANGEID} -EOF -} - - -show_commit_msg_diff() { - if [ $DIFF_MSG_RESULT -ne 0 ]; then - log "Modified commit message:" - diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3 - fi -} - - -amend() { - show_commit_msg_diff - if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then - git commit -a --amend -F "$NEW_COMMIT_MSG" - fi -} - - -diff_msg() { - git log -1 --format=%B > "$ORIG_COMMIT_MSG" - "${dirname_self}"/wrap-commit-msg.py \ - < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG" - cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" - DIFF_MSG_RESULT=$? -} - - -# Temporary files -ORIG_DIFF=orig.diff.$$ -MODIFIED_DIFF=modified.diff.$$ -FINAL_DIFF=final.diff.$$ -ORIG_COMMIT_MSG=orig.commit-msg.$$ -NEW_COMMIT_MSG=new.commit-msg.$$ -CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}" -CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}" - -# Preconditions -[ $# -lt 2 ] || usage - -if ! clang-format -version >/dev/null 2>&1; then - log "clang-format not found" - exit 1 -fi - -if ! git diff --quiet HEAD; then - log "Working tree is dirty, commit your changes first" - exit 1 -fi - -# Need to be in the root -cd "$(git rev-parse --show-toplevel)" - -# Collect the original diff -git show > "${ORIG_DIFF}" - -# Apply the style guide on new and modified files and collect its diff -for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do - case "$f" in - third_party/*) continue;; - esac - vpx_style "$f" -done -git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}" - -# Intersect the two diffs -"${dirname_self}"/intersect-diffs.py \ - "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}" -INTERSECT_RESULT=$? -git reset --hard >/dev/null - -# Fixup the commit message -diff_msg - -# Handle options -if [ -n "$1" ]; then - case "$1" in - -h|--help) usage;; - -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;; - --commit) apply "${FINAL_DIFF}"; commit;; - --amend) apply "${FINAL_DIFF}"; amend;; - --msg-only) amend;; - *) usage;; - esac -else - apply "${FINAL_DIFF}" - if ! git diff --quiet; then - log "Formatting changes applied, verify and commit." - log "See also: http://www.webmproject.org/code/contribute/conventions/" - git diff --stat - fi -fi - -rm -f ${CLEAN_FILES} diff --git a/media/libvpx/libvpx/tools/intersect-diffs.py b/media/libvpx/libvpx/tools/intersect-diffs.py index 4dbafa90b7..590e687b47 100644 --- a/media/libvpx/libvpx/tools/intersect-diffs.py +++ b/media/libvpx/libvpx/tools/intersect-diffs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license @@ -69,7 +69,7 @@ def main(): break if out_hunks: - print FormatDiffHunks(out_hunks) + print(FormatDiffHunks(out_hunks)) sys.exit(1) if __name__ == "__main__": diff --git a/media/libvpx/libvpx/tools/lint-hunks.py b/media/libvpx/libvpx/tools/lint-hunks.py index 6e25d93624..0a94afebb9 100644 --- a/media/libvpx/libvpx/tools/lint-hunks.py +++ b/media/libvpx/libvpx/tools/lint-hunks.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license @@ -10,7 +10,7 @@ """Performs style checking on each diff hunk.""" import getopt import os -import StringIO +import io import subprocess import sys @@ -63,21 +63,21 @@ def main(argv=None): try: try: opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS) - except getopt.error, msg: + except getopt.error as msg: raise Usage(msg) # process options for o, _ in opts: if o in ("-h", "--help"): - print __doc__ + print(__doc__) sys.exit(0) if args and len(args) > 1: - print __doc__ + print(__doc__) sys.exit(0) # Find the fully qualified path to the root of the tree - tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE) + tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True) tl = tl.communicate()[0].strip() # See if we're working on the index or not. @@ -93,9 +93,9 @@ def main(argv=None): # Get a list of all affected lines file_affected_line_map = {} - p = Subprocess(diff_cmd, stdout=subprocess.PIPE) + p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True) stdout = p.communicate()[0] - for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)): + for hunk in diff.ParseDiffHunks(io.StringIO(stdout)): filename = hunk.right.filename[2:] if filename not in file_affected_line_map: file_affected_line_map[filename] = set() @@ -103,21 +103,25 @@ def main(argv=None): # Run each affected file through cpplint lint_failed = False - for filename, affected_lines in file_affected_line_map.iteritems(): + for filename, affected_lines in file_affected_line_map.items(): if filename.split(".")[-1] not in ("c", "h", "cc"): continue + if filename.startswith("third_party"): + continue if args: # File contents come from git show_cmd = SHOW_CMD + [args[0] + ":" + filename] - show = Subprocess(show_cmd, stdout=subprocess.PIPE) + show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True) lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=show.stdout, stderr=subprocess.PIPE) + stdin=show.stdout, stderr=subprocess.PIPE, + text=True) lint_out = lint.communicate()[1] else: # File contents come from the working tree lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), - stdin=subprocess.PIPE, stderr=subprocess.PIPE) + stdin=subprocess.PIPE, stderr=subprocess.PIPE, + text=True) stdin = open(os.path.join(tl, filename)).read() lint_out = lint.communicate(stdin)[1] @@ -127,17 +131,17 @@ def main(argv=None): continue warning_line_num = int(fields[1]) if warning_line_num in affected_lines: - print "%s:%d:%s"%(filename, warning_line_num, - ":".join(fields[2:])) + print("%s:%d:%s"%(filename, warning_line_num, + ":".join(fields[2:]))) lint_failed = True # Set exit code if any relevant lint errors seen if lint_failed: return 1 - except Usage, err: - print >>sys.stderr, err - print >>sys.stderr, "for help use --help" + except Usage as err: + print(err, file=sys.stderr) + print("for help use --help", file=sys.stderr) return 2 if __name__ == "__main__": diff --git a/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py b/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py new file mode 100644 index 0000000000..a46b7e760c --- /dev/null +++ b/media/libvpx/libvpx/tools/non_greedy_mv/non_greedy_mv.py @@ -0,0 +1,195 @@ +## Copyright (c) 2020 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +import sys +import matplotlib.pyplot as plt +from matplotlib.collections import LineCollection +from matplotlib import colors as mcolors +import numpy as np +import math + + +def draw_mv_ls(axis, mv_ls, mode=0): + colors = np.array([(1., 0., 0., 1.)]) + segs = np.array([ + np.array([[ptr[0], ptr[1]], [ptr[0] + ptr[2], ptr[1] + ptr[3]]]) + for ptr in mv_ls + ]) + line_segments = LineCollection( + segs, linewidths=(1.,), colors=colors, linestyle='solid') + axis.add_collection(line_segments) + if mode == 0: + axis.scatter(mv_ls[:, 0], mv_ls[:, 1], s=2, c='b') + else: + axis.scatter( + mv_ls[:, 0] + mv_ls[:, 2], mv_ls[:, 1] + mv_ls[:, 3], s=2, c='b') + + +def draw_pred_block_ls(axis, mv_ls, bs, mode=0): + colors = np.array([(0., 0., 0., 1.)]) + segs = [] + for ptr in mv_ls: + if mode == 0: + x = ptr[0] + y = ptr[1] + else: + x = ptr[0] + ptr[2] + y = ptr[1] + ptr[3] + x_ls = [x, x + bs, x + bs, x, x] + y_ls = [y, y, y + bs, y + bs, y] + + segs.append(np.column_stack([x_ls, y_ls])) + line_segments = LineCollection( + segs, linewidths=(.5,), colors=colors, linestyle='solid') + axis.add_collection(line_segments) + + +def read_frame(fp, no_swap=0): + plane = [None, None, None] + for i in range(3): + line = fp.readline() + word_ls = line.split() + word_ls = [int(item) for item in word_ls] + rows = word_ls[0] + cols = word_ls[1] + + line = fp.readline() + word_ls = line.split() + word_ls = [int(item) for item in word_ls] + + plane[i] = np.array(word_ls).reshape(rows, cols) + if i > 0: + plane[i] = plane[i].repeat(2, axis=0).repeat(2, axis=1) + plane = np.array(plane) + if no_swap == 0: + plane = np.swapaxes(np.swapaxes(plane, 0, 1), 1, 2) + return plane + + +def yuv_to_rgb(yuv): + #mat = np.array([ + # [1.164, 0 , 1.596 ], + # [1.164, -0.391, -0.813], + # [1.164, 2.018 , 0 ] ] + # ) + #c = np.array([[ -16 , -16 , -16 ], + # [ 0 , -128, -128 ], + # [ -128, -128, 0 ]]) + + mat = np.array([[1, 0, 1.4075], [1, -0.3445, -0.7169], [1, 1.7790, 0]]) + c = np.array([[0, 0, 0], [0, -128, -128], [-128, -128, 0]]) + mat_c = np.dot(mat, c) + v = np.array([mat_c[0, 0], mat_c[1, 1], mat_c[2, 2]]) + mat = mat.transpose() + rgb = np.dot(yuv, mat) + v + rgb = rgb.astype(int) + rgb = rgb.clip(0, 255) + return rgb / 255. + + +def read_feature_score(fp, mv_rows, mv_cols): + line = fp.readline() + word_ls = line.split() + feature_score = np.array([math.log(float(v) + 1, 2) for v in word_ls]) + feature_score = feature_score.reshape(mv_rows, mv_cols) + return feature_score + +def read_mv_mode_arr(fp, mv_rows, mv_cols): + line = fp.readline() + word_ls = line.split() + mv_mode_arr = np.array([int(v) for v in word_ls]) + mv_mode_arr = mv_mode_arr.reshape(mv_rows, mv_cols) + return mv_mode_arr + + +def read_frame_dpl_stats(fp): + line = fp.readline() + word_ls = line.split() + frame_idx = int(word_ls[1]) + mi_rows = int(word_ls[3]) + mi_cols = int(word_ls[5]) + bs = int(word_ls[7]) + ref_frame_idx = int(word_ls[9]) + rf_idx = int(word_ls[11]) + gf_frame_offset = int(word_ls[13]) + ref_gf_frame_offset = int(word_ls[15]) + mi_size = bs / 8 + mv_ls = [] + mv_rows = int((math.ceil(mi_rows * 1. / mi_size))) + mv_cols = int((math.ceil(mi_cols * 1. / mi_size))) + for i in range(mv_rows * mv_cols): + line = fp.readline() + word_ls = line.split() + row = int(word_ls[0]) * 8. + col = int(word_ls[1]) * 8. + mv_row = int(word_ls[2]) / 8. + mv_col = int(word_ls[3]) / 8. + mv_ls.append([col, row, mv_col, mv_row]) + mv_ls = np.array(mv_ls) + feature_score = read_feature_score(fp, mv_rows, mv_cols) + mv_mode_arr = read_mv_mode_arr(fp, mv_rows, mv_cols) + img = yuv_to_rgb(read_frame(fp)) + ref = yuv_to_rgb(read_frame(fp)) + return rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr + + +def read_dpl_stats_file(filename, frame_num=0): + fp = open(filename) + line = fp.readline() + width = 0 + height = 0 + data_ls = [] + while (line): + if line[0] == '=': + data_ls.append(read_frame_dpl_stats(fp)) + line = fp.readline() + if frame_num > 0 and len(data_ls) == frame_num: + break + return data_ls + + +if __name__ == '__main__': + filename = sys.argv[1] + data_ls = read_dpl_stats_file(filename, frame_num=5) + for rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score, mv_mode_arr in data_ls: + fig, axes = plt.subplots(2, 2) + + axes[0][0].imshow(img) + draw_mv_ls(axes[0][0], mv_ls) + draw_pred_block_ls(axes[0][0], mv_ls, bs, mode=0) + #axes[0].grid(color='k', linestyle='-') + axes[0][0].set_ylim(img.shape[0], 0) + axes[0][0].set_xlim(0, img.shape[1]) + + if ref is not None: + axes[0][1].imshow(ref) + draw_mv_ls(axes[0][1], mv_ls, mode=1) + draw_pred_block_ls(axes[0][1], mv_ls, bs, mode=1) + #axes[1].grid(color='k', linestyle='-') + axes[0][1].set_ylim(ref.shape[0], 0) + axes[0][1].set_xlim(0, ref.shape[1]) + + axes[1][0].imshow(feature_score) + #feature_score_arr = feature_score.flatten() + #feature_score_max = feature_score_arr.max() + #feature_score_min = feature_score_arr.min() + #step = (feature_score_max - feature_score_min) / 20. + #feature_score_bins = np.arange(feature_score_min, feature_score_max, step) + #axes[1][1].hist(feature_score_arr, bins=feature_score_bins) + im = axes[1][1].imshow(mv_mode_arr) + #axes[1][1].figure.colorbar(im, ax=axes[1][1]) + + print rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset, len(mv_ls) + + flatten_mv_mode = mv_mode_arr.flatten() + zero_mv_count = sum(flatten_mv_mode == 0); + new_mv_count = sum(flatten_mv_mode == 1); + ref_mv_count = sum(flatten_mv_mode == 2) + sum(flatten_mv_mode == 3); + print zero_mv_count, new_mv_count, ref_mv_count + plt.show() diff --git a/media/libvpx/libvpx/tools/set_analyzer_env.sh b/media/libvpx/libvpx/tools/set_analyzer_env.sh new file mode 100644 index 0000000000..a3e8a1ae99 --- /dev/null +++ b/media/libvpx/libvpx/tools/set_analyzer_env.sh @@ -0,0 +1,135 @@ +## Copyright (c) 2018 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## Sourcing this file sets environment variables to simplify setting up +## sanitizer builds and testing. + +sanitizer="${1}" + +case "${sanitizer}" in + address) ;; + cfi) ;; + integer) ;; + memory) ;; + thread) ;; + undefined) ;; + clear) + echo "Clearing environment:" + set -x + unset CC CXX LD AR + unset CFLAGS CXXFLAGS LDFLAGS + unset ASAN_OPTIONS MSAN_OPTIONS TSAN_OPTIONS UBSAN_OPTIONS + set +x + return + ;; + *) + echo "Usage: source set_analyzer_env.sh [|clear]" + echo " Supported sanitizers:" + echo " address cfi integer memory thread undefined" + return 1 + ;; +esac + +if [ ! $(which clang) ]; then + # TODO(johannkoenig): Support gcc analyzers. + echo "ERROR: 'clang' must be in your PATH" + return 1 +fi + +# Warnings. +if [ "${sanitizer}" = "undefined" -o "${sanitizer}" = "integer" ]; then + echo "WARNING: When building the ${sanitizer} sanitizer for 32 bit targets" + echo "you must run:" + echo "export LDFLAGS=\"\${LDFLAGS} --rtlib=compiler-rt -lgcc_s\"" + echo "See http://llvm.org/bugs/show_bug.cgi?id=17693 for details." +fi + +if [ "${sanitizer}" = "undefined" ]; then + major_version=$(clang --version | head -n 1 \ + | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.) + if [ ${major_version} -eq 5 ]; then + echo "WARNING: clang v5 has a problem with vp9 x86_64 high bit depth" + echo "configurations. It can take ~40 minutes to compile" + echo "vpx_dsp/x86/fwd_txfm_sse2.c" + echo "clang v4 did not have this issue." + fi +fi + +echo "It is recommended to configure with '--enable-debug' to improve stack" +echo "traces. On mac builds, run 'dysmutil' on the output binaries (vpxenc," +echo "test_libvpx, etc) to link the stack traces to source code lines." + +# Build configuration. +cflags="-fsanitize=${sanitizer}" +ldflags="-fsanitize=${sanitizer}" + +# Useful backtraces. +cflags="${cflags} -fno-omit-frame-pointer" +# Exact backtraces. +cflags="${cflags} -fno-optimize-sibling-calls" + +case "${sanitizer}" in + cfi) + # https://clang.llvm.org/docs/ControlFlowIntegrity.html + cflags="${cflags} -fno-sanitize-trap=cfi -flto -fvisibility=hidden" + ldflags="${ldflags} -fno-sanitize-trap=cfi -flto -fuse-ld=gold" + export AR="llvm-ar" + ;; + integer|undefined) + # https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html + cflags="${cflags} -fsanitize=float-cast-overflow" + ;; +esac + +set -x +export CC="clang" +export CXX="clang++" +export LD="clang++" + +export CFLAGS="${cflags}" +export CXXFLAGS="${cflags}" +export LDFLAGS="${ldflags}" +set +x + +# Execution configuration. +sanitizer_options="" +sanitizer_options="${sanitizer_options}:handle_segv=1" +sanitizer_options="${sanitizer_options}:handle_abort=1" +sanitizer_options="${sanitizer_options}:handle_sigfpe=1" +sanitizer_options="${sanitizer_options}:fast_unwind_on_fatal=1" +sanitizer_options="${sanitizer_options}:allocator_may_return_null=1" + +case "${sanitizer}" in + address) + sanitizer_options="${sanitizer_options}:detect_stack_use_after_return=1" + sanitizer_options="${sanitizer_options}:max_uar_stack_size_log=17" + set -x + export ASAN_OPTIONS="${sanitizer_options}" + set +x + ;; + cfi) + # No environment settings + ;; + memory) + set -x + export MSAN_OPTIONS="${sanitizer_options}" + set +x + ;; + thread) + # The thread sanitizer uses an entirely independent set of options. + set -x + export TSAN_OPTIONS="halt_on_error=1" + set +x + ;; + undefined|integer) + sanitizer_options="${sanitizer_options}:print_stacktrace=1" + set -x + export UBSAN_OPTIONS="${sanitizer_options}" + set +x + ;; +esac diff --git a/media/libvpx/libvpx/tools/tiny_ssim.c b/media/libvpx/libvpx/tools/tiny_ssim.c index 28052e0a84..c07a9d2118 100644 --- a/media/libvpx/libvpx/tools/tiny_ssim.c +++ b/media/libvpx/libvpx/tools/tiny_ssim.c @@ -8,17 +8,188 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include #include #include +#include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "./y4minput.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" -void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +#if CONFIG_VP9_HIGHBITDEPTH +static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, + uint16_t *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, + int recon_stride, unsigned int cols, + unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } + + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; + } + + orig += orig_stride; + recon += recon_stride; + } + return total_sse; +} + +#define MAX_PSNR 100 +static double mse2psnr(double samples, double peak, double mse) { + double psnr; + + if (mse > 0.0) + psnr = 10.0 * log10(peak * peak * samples / mse); + else + psnr = MAX_PSNR; // Limit to prevent / 0 + + if (psnr > MAX_PSNR) psnr = MAX_PSNR; + + return psnr; +} + +typedef enum { RAW_YUV, Y4M } input_file_type; + +typedef struct input_file { + FILE *file; + input_file_type type; + unsigned char *buf; + y4m_input y4m; + vpx_image_t img; + int w; + int h; + int bit_depth; + int frame_size; +} input_file_t; + +// Open a file and determine if its y4m or raw. If y4m get the header. +static int open_input_file(const char *file_name, input_file_t *input, int w, + int h, int bit_depth) { + char y4m_buf[4]; + input->w = w; + input->h = h; + input->bit_depth = bit_depth; + input->type = RAW_YUV; + input->buf = NULL; + input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin; + if (input->file == NULL) return -1; + if (fread(y4m_buf, 1, 4, input->file) != 4) return -1; + if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; + switch (input->type) { + case Y4M: + y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); + input->w = input->y4m.pic_w; + input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; + // Y4M alloc's its own buf. Init this to avoid problems if we never + // read frames. + memset(&input->img, 0, sizeof(input->img)); + break; + case RAW_YUV: + fseek(input->file, 0, SEEK_SET); + input->w = w; + input->h = h; + // handle odd frame sizes + input->frame_size = w * h + ((w + 1) / 2) * ((h + 1) / 2) * 2; + if (bit_depth > 8) { + input->frame_size *= 2; + } + input->buf = malloc(input->frame_size); + break; + } + return 0; +} + +static void close_input_file(input_file_t *in) { + if (in->file) fclose(in->file); + if (in->type == Y4M) { + vpx_img_free(&in->img); + } else { + free(in->buf); + } +} + +// Returns 1 on success, 0 on failure due to a read error or eof (or format +// error in the case of y4m). +static int read_input_file(input_file_t *in, unsigned char **y, + unsigned char **u, unsigned char **v, int bd) { + size_t r1 = 0; + switch (in->type) { + case Y4M: + r1 = y4m_input_fetch_frame(&in->y4m, in->file, &in->img); + if (r1 == (size_t)-1) return 0; + *y = in->img.planes[0]; + *u = in->img.planes[1]; + *v = in->img.planes[2]; + break; + case RAW_YUV: + if (bd < 9) { + r1 = fread(in->buf, in->frame_size, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h; + *v = *u + ((1 + in->w) / 2) * ((1 + in->h) / 2); + } else { + r1 = fread(in->buf, in->frame_size, 1, in->file); + *y = in->buf; + *u = in->buf + (in->w * in->h) * 2; + *v = *u + 2 * ((1 + in->w) / 2) * ((1 + in->h) / 2); + } + break; + } + + return r1 != 0; +} + +static void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; + } for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; @@ -30,40 +201,79 @@ void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, } } -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; + } + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, - uint32_t sum_sq_r, uint32_t sum_sxr, int count) { - int64_t ssim_n, ssim_d; - int64_t c1, c2; + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + double ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + assert(0); + } - // scale the constants by number of pixels - c1 = (cc1 * count * count) >> 12; - c2 = (cc2 * count * count) >> 12; + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); - - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } -static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); + ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } +#if CONFIG_VP9_HIGHBITDEPTH +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. -double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, - int stride_img2, int width, int height) { +static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { int i, j; int samples = 0; double ssim_total = 0; @@ -81,120 +291,295 @@ double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, return ssim_total; } -static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, - int recon_stride, unsigned int cols, - unsigned int rows) { - unsigned int row, col; - uint64_t total_sse = 0; - int diff; +#if CONFIG_VP9_HIGHBITDEPTH +static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd) { + int i, j; + int samples = 0; + double ssim_total = 0; - for (row = 0; row < rows; row++) { - for (col = 0; col < cols; col++) { - diff = orig[col] - recon[col]; - total_sse += diff * diff; + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = + highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd); + ssim_total += v; + samples++; } - - orig += orig_stride; - recon += recon_stride; } - - return total_sse; -} - -#define MAX_PSNR 100 - -double vp9_mse2psnr(double samples, double peak, double mse) { - double psnr; - - if (mse > 0.0) - psnr = 10.0 * log10(peak * peak * samples / mse); - else - psnr = MAX_PSNR; // Limit to prevent / 0 - - if (psnr > MAX_PSNR) psnr = MAX_PSNR; - - return psnr; + ssim_total /= samples; + return ssim_total; } +#endif // CONFIG_VP9_HIGHBITDEPTH int main(int argc, char *argv[]) { - FILE *f[2]; - uint8_t *buf[2]; - int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0; - double ssim = 0, psnravg = 0, psnrglb = 0; - double ssimy, ssimu, ssimv; - uint64_t psnry, psnru, psnrv; + FILE *framestats = NULL; + int bit_depth = 8; + int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0; + double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0; + double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0; + double psnravg = 0, psnryavg = 0, psnruavg = 0, psnrvavg = 0; + double *ssimy = NULL, *ssimu = NULL, *ssimv = NULL; + uint64_t *psnry = NULL, *psnru = NULL, *psnrv = NULL; + size_t i, n_frames = 0, allocated_frames = 0; + int return_value = 0; + input_file_t in[2]; + double peak = 255.0; - if (argc < 4) { - fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n", + memset(in, 0, sizeof(in)); + + if (argc < 3) { + fprintf(stderr, + "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" + " [WxH tl_skip={0,1,3} frame_stats_file bits]\n", argv[0]); return 1; } - f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin; - f[1] = strcmp(argv[2], "-") ? fopen(argv[2], "rb") : stdin; - sscanf(argv[3], "%dx%d", &w, &h); - // Number of frames to skip from file1.yuv for every frame used. Normal values - // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding - // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. - if (argc > 4) { - sscanf(argv[4], "%d", &tl_skip); - } - if (!f[0] || !f[1]) { - fprintf(stderr, "Could not open input files: %s\n", strerror(errno)); - return 1; - } - if (w <= 0 || h <= 0 || w & 1 || h & 1) { - fprintf(stderr, "Invalid size %dx%d\n", w, h); - return 1; - } - buf[0] = malloc(w * h * 3 / 2); - buf[1] = malloc(w * h * 3 / 2); - n_frames = 0; - while (1) { - size_t r1, r2; - r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]); - if (r1) { - // Reading parts of file1.yuv that were not used in temporal layer. - if (tl_skips_remaining > 0) { - --tl_skips_remaining; - continue; - } - // Use frame, but skip |tl_skip| after it. - tl_skips_remaining = tl_skip; + + if (argc > 3) { + if (sscanf(argv[3], "%dx%d", &w, &h) != 2) { + fprintf(stderr, "arguments for w/h not assigned!\n"); + goto clean_up; } - r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]); - if (r1 && r2 && r1 != r2) { - fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), - (int)r1, (int)r2); - return 1; - } else if (r1 == 0 || r2 == 0) { + // Limit width/height to 4K. The frame_size set in the function + // open_input_file() will still be within range of int. + if (w < 1 || w > 4096 || h < 1 || h > 4096) { + fprintf(stderr, + "width or height is too large (above 4096) or below 1!\n"); + goto clean_up; + } + } + + if (argc > 6) { + if (sscanf(argv[6], "%d", &bit_depth) != 1) { + fprintf(stderr, "argument for bit_depth not assigned!\n"); + goto clean_up; + } + } + + if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[1]); + goto clean_up; + } + + if (w == 0 && h == 0) { + // If a y4m is the first file and w, h is not set grab from first file. + w = in[0].w; + h = in[0].h; + bit_depth = in[0].bit_depth; + } + if (bit_depth == 10) peak = 1023.0; + + if (bit_depth == 12) peak = 4095.0; + + if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + goto clean_up; + } + + if (in[0].w != in[1].w || in[0].h != in[1].h || in[0].w != w || + in[0].h != h || w == 0 || h == 0) { + fprintf(stderr, + "Failing: Image dimensions don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + if (in[0].bit_depth != in[1].bit_depth) { + fprintf(stderr, + "Failing: Image bit depths don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + bit_depth = in[0].bit_depth; + + // Number of frames to skip from file1.yuv for every frame used. Normal + // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL + // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer + // encoding. + if (argc > 4) { + if (sscanf(argv[4], "%d", &tl_skip) != 1) { + fprintf(stderr, "argument for tl_skip not assigned!\n"); + goto clean_up; + } + if (argc > 5) { + framestats = fopen(argv[5], "w"); + if (!framestats) { + fprintf(stderr, "Could not open \"%s\" for writing: %s\n", argv[5], + strerror(errno)); + return_value = 1; + goto clean_up; + } + } + } + + while (1) { + int r1, r2; + unsigned char *y[2], *u[2], *v[2]; + + r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth); + if (r1 == 0) { + if (ferror(in[0].file)) { + fprintf(stderr, "Failed to read data from '%s'\n", argv[1]); + return_value = 1; + goto clean_up; + } break; } -#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ - psnr = calc_plane_error(buf0, w, buf1, w, w, h); - psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h); - psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2); - psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4, - w / 2, h / 2); - ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv); - psnravg += - vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv); - psnrglb += psnry + psnru + psnrv; + + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + + r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth); + if (r2 == 0) { + if (ferror(in[1].file)) { + fprintf(stderr, "Failed to read data from '%s'\n", argv[2]); + return_value = 1; + goto clean_up; + } + break; + } + +#if CONFIG_VP9_HIGHBITDEPTH +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \ + w, w, w, h, bit_depth); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } \ + } while (0) +#else +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + do { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } while (0) +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_frames == allocated_frames) { + allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; + ssimy = realloc(ssimy, allocated_frames * sizeof(*ssimy)); + ssimu = realloc(ssimu, allocated_frames * sizeof(*ssimu)); + ssimv = realloc(ssimv, allocated_frames * sizeof(*ssimv)); + psnry = realloc(psnry, allocated_frames * sizeof(*psnry)); + psnru = realloc(psnru, allocated_frames * sizeof(*psnru)); + psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); + if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) { + fprintf(stderr, "Error allocating SSIM/PSNR data.\n"); + exit(EXIT_FAILURE); + } + } + psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, + (h + 1) / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2, + (h + 1) / 2); + n_frames++; } - free(buf[0]); - free(buf[1]); - ssim /= n_frames; + + if (framestats) { + fprintf(framestats, + "ssim,ssim-y,ssim-u,ssim-v,psnr,psnr-y,psnr-u,psnr-v\n"); + } + + for (i = 0; i < n_frames; ++i) { + double frame_ssim; + double frame_psnr, frame_psnry, frame_psnru, frame_psnrv; + + frame_ssim = 0.8 * ssimy[i] + 0.1 * (ssimu[i] + ssimv[i]); + ssimavg += frame_ssim; + ssimyavg += ssimy[i]; + ssimuavg += ssimu[i]; + ssimvavg += ssimv[i]; + + frame_psnr = + mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]); + frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]); + frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]); + frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]); + + psnravg += frame_psnr; + psnryavg += frame_psnry; + psnruavg += frame_psnru; + psnrvavg += frame_psnrv; + + psnryglb += psnry[i]; + psnruglb += psnru[i]; + psnrvglb += psnrv[i]; + + if (framestats) { + fprintf(framestats, "%lf,%lf,%lf,%lf,%lf,%lf,%lf,%lf\n", frame_ssim, + ssimy[i], ssimu[i], ssimv[i], frame_psnr, frame_psnry, + frame_psnru, frame_psnrv); + } + } + + ssimavg /= n_frames; + ssimyavg /= n_frames; + ssimuavg /= n_frames; + ssimvavg /= n_frames; + + printf("VpxSSIM: %lf\n", 100 * pow(ssimavg, 8.0)); + printf("SSIM: %lf\n", ssimavg); + printf("SSIM-Y: %lf\n", ssimyavg); + printf("SSIM-U: %lf\n", ssimuavg); + printf("SSIM-V: %lf\n", ssimvavg); + puts(""); + psnravg /= n_frames; - psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); + psnryavg /= n_frames; + psnruavg /= n_frames; + psnrvavg /= n_frames; printf("AvgPSNR: %lf\n", psnravg); + printf("AvgPSNR-Y: %lf\n", psnryavg); + printf("AvgPSNR-U: %lf\n", psnruavg); + printf("AvgPSNR-V: %lf\n", psnrvavg); + puts(""); + + psnrglb = psnryglb + psnruglb + psnrvglb; + psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb); + psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb); + psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb); + psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb); + printf("GlbPSNR: %lf\n", psnrglb); - printf("SSIM: %lf\n", 100 * pow(ssim, 8.0)); - printf("Nframes: %d\n", n_frames); + printf("GlbPSNR-Y: %lf\n", psnryglb); + printf("GlbPSNR-U: %lf\n", psnruglb); + printf("GlbPSNR-V: %lf\n", psnrvglb); + puts(""); - if (strcmp(argv[1], "-")) fclose(f[0]); - if (strcmp(argv[2], "-")) fclose(f[1]); + printf("Nframes: %d\n", (int)n_frames); - return 0; +clean_up: + + close_input_file(&in[0]); + close_input_file(&in[1]); + + if (framestats) fclose(framestats); + + free(ssimy); + free(ssimu); + free(ssimv); + + free(psnry); + free(psnru); + free(psnrv); + + return return_value; } diff --git a/media/libvpx/libvpx/tools/wrap-commit-msg.py b/media/libvpx/libvpx/tools/wrap-commit-msg.py index d5b4b046b1..ba3fa58732 100644 --- a/media/libvpx/libvpx/tools/wrap-commit-msg.py +++ b/media/libvpx/libvpx/tools/wrap-commit-msg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ## Copyright (c) 2012 The WebM project authors. All Rights Reserved. ## ## Use of this source code is governed by a BSD-style license diff --git a/media/libvpx/libvpx/tools_common.c b/media/libvpx/libvpx/tools_common.c index 6f14c25561..5af971f720 100644 --- a/media/libvpx/libvpx/tools_common.c +++ b/media/libvpx/libvpx/tools_common.c @@ -24,15 +24,11 @@ #include "vpx/vp8dx.h" #endif -#if defined(_WIN32) || defined(__OS2__) +#include "vpx/vpx_codec.h" + +#if defined(_WIN32) #include #include - -#ifdef __OS2__ -#define _setmode setmode -#define _fileno fileno -#define _O_BINARY O_BINARY -#endif #endif #define LOG_ERROR(label) \ @@ -46,9 +42,17 @@ va_end(ap); \ } while (0) +#if CONFIG_ENCODERS +/* Swallow warnings about unused results of fread/fwrite */ +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { + return fread(ptr, size, nmemb, stream); +} +#define fread wrap_fread +#endif + FILE *set_binary_mode(FILE *stream) { (void)stream; -#if defined(_WIN32) || defined(__OS2__) +#if defined(_WIN32) _setmode(_fileno(stream), _O_BINARY); #endif return stream; @@ -69,8 +73,8 @@ void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); - printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) printf(" %s\n", detail); + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } @@ -83,10 +87,13 @@ int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) { for (plane = 0; plane < 3; ++plane) { uint8_t *ptr; - const int w = vpx_img_plane_width(yuv_frame, plane); + int w = vpx_img_plane_width(yuv_frame, plane); const int h = vpx_img_plane_height(yuv_frame, plane); int r; - + // Assuming that for nv12 we read all chroma data at once + if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (yuv_frame->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; /* Determine the correct plane based on the image format. The for-loop * always counts in Y,U,V order, but this may not match the order of * the data on disk. @@ -200,8 +207,6 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) { #endif // CONFIG_DECODERS -// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part -// of vpx_image_t support int vpx_img_plane_width(const vpx_image_t *img, int plane) { if (plane > 0 && img->x_chroma_shift > 0) return (img->d_w + 1) >> img->x_chroma_shift; @@ -218,17 +223,22 @@ int vpx_img_plane_height(const vpx_image_t *img, int plane) { void vpx_img_write(const vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -236,17 +246,22 @@ void vpx_img_write(const vpx_image_t *img, FILE *file) { int vpx_img_read(vpx_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = vpx_img_plane_width(img, plane) * - ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int w = vpx_img_plane_width(img, plane); const int h = vpx_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == VPX_IMG_FMT_NV12 && plane > 1) break; + // Fixing NV12 chroma width if it is odd + if (img->fmt == VPX_IMG_FMT_NV12 && plane == 1) w = (w + 1) & ~1; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return 0; + if (fread(buf, bytespp, w, file) != (size_t)w) return 0; buf += stride; } } @@ -266,6 +281,88 @@ double sse_to_psnr(double samples, double peak, double sse) { } } +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +void open_input_file(struct VpxInputContext *input) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.vpx_fmt; + input->bit_depth = input->y4m.bit_depth; + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +void close_input_file(struct VpxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} +#endif + // TODO(debargha): Consolidate the functions below into a separate file. #if CONFIG_VP9_HIGHBITDEPTH static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, @@ -284,7 +381,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -320,7 +417,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -361,7 +458,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -396,7 +493,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -430,7 +527,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -459,3 +556,225 @@ void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) { } } #endif // CONFIG_VP9_HIGHBITDEPTH + +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + uint32_t i; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); +#if CONFIG_VP9_HIGHBITDEPTH + if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } +#endif + + for (i = 0; i < img1->d_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + l_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + c_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + c_w) == 0); + + return match; +} + +#define mmin(a, b) ((a) < (b) ? (a) : (b)) + +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + uint16_t *plane1, *plane2; + uint32_t stride1, stride2; + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y] / 2; + stride2 = img2->stride[VPX_PLANE_Y] / 2; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(plane1 + (i + k) * stride1 + j + l); + yloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U] / 2; + stride2 = img2->stride[VPX_PLANE_U] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(plane1 + (i + k) * stride1 + j + l); + uloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V] / 2; + stride2 = img2->stride[VPX_PLANE_V] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(plane1 + (i + k) * stride1 + j + l); + vloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]) { + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != + *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l); + yloc[3] = *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l) != + *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l); + uloc[3] = *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l); + match = 0; + break; + } + } + } + } + } + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l) != + *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l); + vloc[3] = *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l); + match = 0; + break; + } + } + } + } + } +} diff --git a/media/libvpx/libvpx/tools_common.h b/media/libvpx/libvpx/tools_common.h index 73ba1bc03b..81453cc065 100644 --- a/media/libvpx/libvpx/tools_common.h +++ b/media/libvpx/libvpx/tools_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TOOLS_COMMON_H_ -#define TOOLS_COMMON_H_ +#ifndef VPX_TOOLS_COMMON_H_ +#define VPX_TOOLS_COMMON_H_ #include @@ -16,7 +16,6 @@ #include "vpx/vpx_codec.h" #include "vpx/vpx_image.h" #include "vpx/vpx_integer.h" -#include "vpx_ports/msvc.h" #if CONFIG_ENCODERS #include "./y4minput.h" @@ -26,11 +25,27 @@ /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 +typedef int64_t FileOffset; #elif defined(_WIN32) /* MinGW uses f{seek,tell}o64 for large files. */ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ +#include /* NOLINT */ +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) @@ -42,13 +57,6 @@ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ -/* Use 32-bit file operations in WebM file format when building ARM - * executables (.axf) with RVCT. */ -#if !CONFIG_OS_SUPPORT -#define fseeko fseek -#define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ - #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX @@ -106,30 +114,44 @@ extern "C" { #if defined(__GNUC__) #define VPX_NO_RETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define VPX_NO_RETURN __declspec(noreturn) #else #define VPX_NO_RETURN #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef VPX_TOOLS_FORMAT_PRINTF +#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + /* Sets a stdio stream into binary mode */ FILE *set_binary_mode(FILE *stream); -void die(const char *fmt, ...) VPX_NO_RETURN; -void fatal(const char *fmt, ...) VPX_NO_RETURN; -void warn(const char *fmt, ...); +VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); +void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2); -void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN; +VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s); /* The tool including this file must define usage_exit() */ -void usage_exit(void) VPX_NO_RETURN; +VPX_NO_RETURN void usage_exit(void); #undef VPX_NO_RETURN int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { - const char *const name; - const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(); + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); @@ -141,8 +163,6 @@ const VpxInterface *get_vpx_decoder_by_index(int i); const VpxInterface *get_vpx_decoder_by_name(const char *name); const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc); -// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part -// of vpx_image_t support int vpx_img_plane_width(const vpx_image_t *img, int plane); int vpx_img_plane_height(const vpx_image_t *img, int plane); void vpx_img_write(const vpx_image_t *img, FILE *file); @@ -150,14 +170,31 @@ int vpx_img_read(vpx_image_t *img, FILE *file); double sse_to_psnr(double samples, double peak, double mse); +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img); +int file_is_y4m(const char detect[4]); +int fourcc_is_ivf(const char detect[4]); +void open_input_file(struct VpxInputContext *input); +void close_input_file(struct VpxInputContext *input); +#endif + #if CONFIG_VP9_HIGHBITDEPTH void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift); void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift); void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); #endif +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2); +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); +#endif +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]); + #ifdef __cplusplus } /* extern "C" */ #endif -#endif // TOOLS_COMMON_H_ +#endif // VPX_TOOLS_COMMON_H_ diff --git a/media/libvpx/libvpx/usage_cx.dox b/media/libvpx/libvpx/usage_cx.dox index 92b0d34ef4..b2220cfdde 100644 --- a/media/libvpx/libvpx/usage_cx.dox +++ b/media/libvpx/libvpx/usage_cx.dox @@ -8,6 +8,8 @@ \ref usage_deadline. + \if samples \ref samples + \endif */ diff --git a/media/libvpx/libvpx/usage_dx.dox b/media/libvpx/libvpx/usage_dx.dox index 883ce24926..85063f705b 100644 --- a/media/libvpx/libvpx/usage_dx.dox +++ b/media/libvpx/libvpx/usage_dx.dox @@ -11,7 +11,9 @@ \ref usage_postproc based on the amount of free CPU time. For more information on the deadline parameter, see \ref usage_deadline. + \if samples \ref samples + \endif \section usage_cb Callback Based Decoding diff --git a/media/libvpx/libvpx/video_common.h b/media/libvpx/libvpx/video_common.h index 44b27a8390..77eb9fac0c 100644 --- a/media/libvpx/libvpx/video_common.h +++ b/media/libvpx/libvpx/video_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VIDEO_COMMON_H_ -#define VIDEO_COMMON_H_ +#ifndef VPX_VIDEO_COMMON_H_ +#define VPX_VIDEO_COMMON_H_ #include "./tools_common.h" @@ -20,4 +20,4 @@ typedef struct { struct VpxRational time_base; } VpxVideoInfo; -#endif // VIDEO_COMMON_H_ +#endif // VPX_VIDEO_COMMON_H_ diff --git a/media/libvpx/libvpx/video_reader.c b/media/libvpx/libvpx/video_reader.c index a0ba2521c6..16822eff3c 100644 --- a/media/libvpx/libvpx/video_reader.c +++ b/media/libvpx/libvpx/video_reader.c @@ -30,17 +30,37 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) { char header[32]; VpxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); - if (!file) return NULL; // Can't open file + if (!file) { + fprintf(stderr, "%s can't be opened.\n", filename); // Can't open file + return NULL; + } - if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header + if (fread(header, 1, 32, file) != 32) { + fprintf(stderr, "File header on %s can't be read.\n", + filename); // Can't read file header + return NULL; + } + if (memcmp(kIVFSignature, header, 4) != 0) { + fprintf(stderr, "The IVF signature on %s is wrong.\n", + filename); // Wrong IVF signature - if (memcmp(kIVFSignature, header, 4) != 0) - return NULL; // Wrong IVF signature + return NULL; + } + if (mem_get_le16(header + 4) != 0) { + fprintf(stderr, "%s uses the wrong IVF version.\n", + filename); // Wrong IVF version - if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version + return NULL; + } reader = calloc(1, sizeof(*reader)); - if (!reader) return NULL; // Can't allocate VpxVideoReader + if (!reader) { + fprintf( + stderr, + "Can't allocate VpxVideoReader\n"); // Can't allocate VpxVideoReader + + return NULL; + } reader->file = file; reader->info.codec_fourcc = mem_get_le32(header + 8); diff --git a/media/libvpx/libvpx/video_reader.h b/media/libvpx/libvpx/video_reader.h index 73c25b00a7..1f5c8088bb 100644 --- a/media/libvpx/libvpx/video_reader.h +++ b/media/libvpx/libvpx/video_reader.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VIDEO_READER_H_ -#define VIDEO_READER_H_ +#ifndef VPX_VIDEO_READER_H_ +#define VPX_VIDEO_READER_H_ #include "./video_common.h" @@ -48,4 +48,4 @@ const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); } // extern "C" #endif -#endif // VIDEO_READER_H_ +#endif // VPX_VIDEO_READER_H_ diff --git a/media/libvpx/libvpx/video_writer.c b/media/libvpx/libvpx/video_writer.c index 56d428b072..6e9a848bc3 100644 --- a/media/libvpx/libvpx/video_writer.c +++ b/media/libvpx/libvpx/video_writer.c @@ -37,11 +37,15 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, if (container == kContainerIVF) { VpxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); - if (!file) return NULL; - + if (!file) { + fprintf(stderr, "%s can't be written to.\n", filename); + return NULL; + } writer = malloc(sizeof(*writer)); - if (!writer) return NULL; - + if (!writer) { + fprintf(stderr, "Can't allocate VpxVideoWriter.\n"); + return NULL; + } writer->frame_count = 0; writer->info = *info; writer->file = file; @@ -50,7 +54,7 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, return writer; } - + fprintf(stderr, "VpxVideoWriter supports only IVF.\n"); return NULL; } diff --git a/media/libvpx/libvpx/video_writer.h b/media/libvpx/libvpx/video_writer.h index a769811c44..b4d242b920 100644 --- a/media/libvpx/libvpx/video_writer.h +++ b/media/libvpx/libvpx/video_writer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VIDEO_WRITER_H_ -#define VIDEO_WRITER_H_ +#ifndef VPX_VIDEO_WRITER_H_ +#define VPX_VIDEO_WRITER_H_ #include "./video_common.h" @@ -41,4 +41,4 @@ int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, } // extern "C" #endif -#endif // VIDEO_WRITER_H_ +#endif // VPX_VIDEO_WRITER_H_ diff --git a/media/libvpx/libvpx/vp8/common/alloccommon.h b/media/libvpx/libvpx/vp8/common/alloccommon.h index 5d0840c670..2d376bbac3 100644 --- a/media/libvpx/libvpx/vp8/common/alloccommon.h +++ b/media/libvpx/libvpx/vp8/common/alloccommon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ALLOCCOMMON_H_ -#define VP8_COMMON_ALLOCCOMMON_H_ +#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_ +#define VPX_VP8_COMMON_ALLOCCOMMON_H_ #include "onyxc_int.h" @@ -21,10 +21,10 @@ void vp8_create_common(VP8_COMMON *oci); void vp8_remove_common(VP8_COMMON *oci); void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); -void vp8_setup_version(VP8_COMMON *oci); +void vp8_setup_version(VP8_COMMON *cm); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_ALLOCCOMMON_H_ +#endif // VPX_VP8_COMMON_ALLOCCOMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c index e12f65a042..48a1972048 100644 --- a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.c @@ -8,28 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" -typedef void loopfilter_y_neon(unsigned char *src, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh); -typedef void loopfilter_uv_neon(unsigned char *u, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh, unsigned char *v); - -extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; - -extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; - /* NEON loopfilter functions */ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, diff --git a/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 0000000000..6cf660d228 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ +#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ + +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh, unsigned char *v); + +loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; + +#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c index af566c2c41..590956dde1 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -10,7 +10,10 @@ #include #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, @@ -21,35 +24,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); } -static INLINE void store4x4(unsigned char *dst, int dst_stride, - const uint8x8_t a0, const uint8x8_t a1) { - if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1); - } else { - // Store to the aligned local buffer and memcpy instead of vget_lane_u8 - // which is really really slow. - uint32_t output_buffer[4]; - vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0); - vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1); - vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0); - vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1); - - memcpy(dst, output_buffer, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 1, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 2, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 3, 4); - } -} - void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, @@ -122,7 +96,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, // secondpass_filter if (yoffset == 0) { // skip_2ndpass_filter - store4x4(dst_ptr, dst_pitch, e0, e1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); } else { uint8x8_t f0, f1; const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]); @@ -140,7 +114,7 @@ void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr, f0 = vqrshrn_n_u16(b0, 7); f1 = vqrshrn_n_u16(b1, 7); - store4x4(dst_ptr, dst_pitch, f0, f1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1)); } } diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c index c1d293b58d..c89b47d628 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) { uint8x8_t vtmp; diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c index 6edff3c69f..791aaea2ae 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -10,6 +10,7 @@ #include +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c index d61dde86cf..5c26ce67a4 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -8,15 +8,226 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include -/* place these declarations here because we don't want to maintain them - * outside of this scope - */ -void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, - int stride); -void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); +#include "./vp8_rtcd.h" + +static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } +} + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, + unsigned char *dst, int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); +} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { @@ -43,42 +254,42 @@ void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; - dstu += 4 * stride; + dst_u += 4 * stride; if (((short *)(eobs))[1]) { if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; if (((short *)(eobs))[2]) { if (((short *)eobs)[2] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } q += 32; - dstv += 4 * stride; + dst_v += 4 * stride; if (((short *)(eobs))[3]) { if (((short *)eobs)[3] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } } diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c deleted file mode 100644 index c83102a5cc..0000000000 --- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, - int stride) { - unsigned char *dst0; - int i, a0, a1; - int16x8x2_t q2Add; - int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); - uint8x8_t d2u8, d4u8; - uint16x8_t q1u16, q2u16; - - a0 = ((q[0] * dq) + 4) >> 3; - a1 = ((q[16] * dq) + 4) >> 3; - q[0] = q[16] = 0; - q2Add.val[0] = vdupq_n_s16((int16_t)a0); - q2Add.val[1] = vdupq_n_s16((int16_t)a1); - - for (i = 0; i < 2; i++, dst += 4) { - dst0 = dst; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); - dst0 += stride; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); - - q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d2s32)); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d4s32)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); - d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - - d2s32 = vreinterpret_s32_u8(d2u8); - d4s32 = vreinterpret_s32_u8(d4u8); - - dst0 = dst; - vst1_lane_s32((int32_t *)dst0, d2s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d2s32, 1); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 1); - } - return; -} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c deleted file mode 100644 index f30671cc3f..0000000000 --- a/media/libvpx/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 17734; -// because the lowest bit in 0x8a8c is 0, we can pre-shift this - -void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst, - int stride) { - unsigned char *dst0, *dst1; - int32x2_t d28, d29, d30, d31; - int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; - int16x8_t qEmpty = vdupq_n_s16(0); - int32x4x2_t q2tmp0, q2tmp1; - int16x8x2_t q2tmp2, q2tmp3; - int16x4_t dLow0, dLow1, dHigh0, dHigh1; - - d28 = d29 = d30 = d31 = vdup_n_s32(0); - - // load dq - q0 = vld1q_s16(dq); - dq += 8; - q1 = vld1q_s16(dq); - - // load q - q2 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q3 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q4 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q5 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - - // load src from dst - dst0 = dst; - dst1 = dst + 4; - d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); - dst0 += stride; - d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); - dst1 += stride; - d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); - dst0 += stride; - d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); - dst1 += stride; - - d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); - dst0 += stride; - d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); - dst1 += stride; - d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); - d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); - - q2 = vmulq_s16(q2, q0); - q3 = vmulq_s16(q3, q1); - q4 = vmulq_s16(q4, q0); - q5 = vmulq_s16(q5, q1); - - // vswp - dLow0 = vget_low_s16(q2); - dHigh0 = vget_high_s16(q2); - dLow1 = vget_low_s16(q4); - dHigh1 = vget_high_s16(q4); - q2 = vcombine_s16(dLow0, dLow1); - q4 = vcombine_s16(dHigh0, dHigh1); - - dLow0 = vget_low_s16(q3); - dHigh0 = vget_high_s16(q3); - dLow1 = vget_low_s16(q5); - dHigh1 = vget_high_s16(q5); - q3 = vcombine_s16(dLow0, dLow1); - q5 = vcombine_s16(dHigh0, dHigh1); - - q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); - q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); - q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); - q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); - - q10 = vqaddq_s16(q2, q3); - q11 = vqsubq_s16(q2, q3); - - q8 = vshrq_n_s16(q8, 1); - q9 = vshrq_n_s16(q9, 1); - - q4 = vqaddq_s16(q4, q8); - q5 = vqaddq_s16(q5, q9); - - q2 = vqsubq_s16(q6, q5); - q3 = vqaddq_s16(q7, q4); - - q4 = vqaddq_s16(q10, q3); - q5 = vqaddq_s16(q11, q2); - q6 = vqsubq_s16(q11, q2); - q7 = vqsubq_s16(q10, q3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - // loop 2 - q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); - q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); - q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); - q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); - - q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); - q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); - - q10 = vshrq_n_s16(q10, 1); - q11 = vshrq_n_s16(q11, 1); - - q10 = vqaddq_s16(q2tmp2.val[1], q10); - q11 = vqaddq_s16(q2tmp3.val[1], q11); - - q8 = vqsubq_s16(q8, q11); - q9 = vqaddq_s16(q9, q10); - - q4 = vqaddq_s16(q2, q9); - q5 = vqaddq_s16(q3, q8); - q6 = vqsubq_s16(q3, q8); - q7 = vqsubq_s16(q2, q9); - - q4 = vrshrq_n_s16(q4, 3); - q5 = vrshrq_n_s16(q5, 3); - q6 = vrshrq_n_s16(q6, 3); - q7 = vrshrq_n_s16(q7, 3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - q4 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); - q5 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); - q6 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); - q7 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); - - d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); - d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); - d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); - d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); - - dst0 = dst; - dst1 = dst + 4; - vst1_lane_s32((int32_t *)dst0, d28, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d28, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d29, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d29, 1); - dst1 += stride; - - vst1_lane_s32((int32_t *)dst0, d30, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d30, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d31, 0); - vst1_lane_s32((int32_t *)dst1, d31, 1); - return; -} diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c index 6c4bcc134b..91600bfc00 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) { int16x8_t q0s16, q1s16, q2s16, q3s16; int16x4_t d4s16, d5s16, d6s16, d7s16; diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c index a168219705..df983b23a3 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c index 80a222d248..fbc83ae290 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c index 65eec300ff..fafaf2d451 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit uint8x16_t qlimit, // limit diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index fbb552ebe2..a54e81084b 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -11,10 +11,12 @@ #include #include #include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positionyys are */ + { 0, 0, -128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, @@ -42,35 +44,6 @@ static INLINE uint8x8_t load_and_shift(const unsigned char *a) { return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32)); } -static INLINE void store4x4(unsigned char *dst, int dst_stride, - const uint8x8_t a0, const uint8x8_t a1) { - if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1); - } else { - // Store to the aligned local buffer and memcpy instead of vget_lane_u8 - // which is really really slow. - uint32_t output_buffer[4]; - vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0); - vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1); - vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0); - vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1); - - memcpy(dst, output_buffer, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 1, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 2, 4); - dst += dst_stride; - memcpy(dst, output_buffer + 3, 4); - } -} - static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b, const uint8x8_t filter, uint16x8_t *c, uint16x8_t *d) { @@ -180,7 +153,7 @@ static INLINE void yonly4x4(const unsigned char *src, int src_stride, e0 = vqrshrun_n_s16(d0, 7); e1 = vqrshrun_n_s16(d1, 7); - store4x4(dst, dst_stride, e0, e1); + store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1)); } void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -297,7 +270,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, b2 = vqrshrun_n_s16(e4567, 7); if (yoffset == 0) { // firstpass_filter4x4_only - store4x4(dst_ptr, dst_pitch, b0, b2); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2)); return; } @@ -411,7 +384,7 @@ void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line, e0 = vqrshrun_n_s16(d0, 7); e1 = vqrshrun_n_s16(d1, 7); - store4x4(dst_ptr, dst_pitch, e0, e1); + store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1)); } void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -808,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); - return; } void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -1277,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } - return; } void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, @@ -1531,7 +1502,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, src += src_pixels_per_line; d12u8 = vld1_u8(src); d13u8 = vld1_u8(src + 8); - d14u8 = vld1_u8(src + 16); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); src += src_pixels_per_line; __builtin_prefetch(src); @@ -1753,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, dst += dst_pitch; } } - return; } diff --git a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c index d7286739da..ebc004a048 100644 --- a/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c +++ b/media/libvpx/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vpx_ports/arm.h" static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // flimit diff --git a/media/libvpx/libvpx/vp8/common/blockd.c b/media/libvpx/libvpx/vp8/common/blockd.c index f47c5bae15..22905c10a6 100644 --- a/media/libvpx/libvpx/vp8/common/blockd.c +++ b/media/libvpx/libvpx/vp8/common/blockd.c @@ -11,9 +11,9 @@ #include "blockd.h" #include "vpx_mem/vpx_mem.h" -const unsigned char vp8_block2left[25] = { - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; -const unsigned char vp8_block2above[25] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 -}; +const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 3, 3, 3, 3, 4, 4, + 5, 5, 6, 6, 7, 7, 8 }; +const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, + 1, 2, 3, 0, 1, 2, 3, 4, 5, + 4, 5, 6, 7, 6, 7, 8 }; diff --git a/media/libvpx/libvpx/vp8/common/blockd.h b/media/libvpx/libvpx/vp8/common/blockd.h index 74fc5d6dbf..8300aad941 100644 --- a/media/libvpx/libvpx/vp8/common/blockd.h +++ b/media/libvpx/libvpx/vp8/common/blockd.h @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_BLOCKD_H_ -#define VP8_COMMON_BLOCKD_H_ +#ifndef VPX_VP8_COMMON_BLOCKD_H_ +#define VPX_VP8_COMMON_BLOCKD_H_ void vpx_log(const char *format, ...); +#include "vpx/internal/vpx_codec_internal.h" #include "vpx_config.h" #include "vpx_scale/yv12config.h" #include "mv.h" @@ -37,7 +38,9 @@ extern "C" { #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -typedef struct { int r, c; } POS; +typedef struct { + int r, c; +} POS; #define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y2 1 @@ -55,7 +58,7 @@ typedef struct { extern const unsigned char vp8_block2left[25]; extern const unsigned char vp8_block2above[25]; -#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B); +#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B) typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE; @@ -169,12 +172,20 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; int is_frame_dropped; + // If frame is dropped due to overshoot after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; // The frame rate for the lowest resolution. double low_res_framerate; /* The frame number of each reference frames */ unsigned int low_res_ref_frames[MAX_REF_FRAMES]; // The video frame counter value for the key frame, for lowest resolution. unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. + unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; LOWER_RES_MB_INFO *mb_info; } LOWER_RES_FRAME_INFO; #endif @@ -191,8 +202,9 @@ typedef struct blockd { union b_mode_info bmi; } BLOCKD; -typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, - int yofst, unsigned char *dst, int dst_pitch); +typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch); typedef struct macroblockd { DECLARE_ALIGNED(16, unsigned char, predictor[384]); @@ -239,7 +251,7 @@ typedef struct macroblockd { unsigned char update_mb_segmentation_data; /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char mb_segement_abs_delta; + unsigned char mb_segment_abs_delta; /* Per frame flags that define which MB level features (such as quantizer or * loop filter level) */ @@ -278,7 +290,9 @@ typedef struct macroblockd { int corrupted; -#if ARCH_X86 || ARCH_X86_64 + struct vpx_internal_error_info error_info; + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 /* This is an intermediate buffer currently used in sub-pixel motion search * to keep a copy of the reference area. This buffer can be used for other * purpose. @@ -294,4 +308,4 @@ extern void vp8_setup_block_dptrs(MACROBLOCKD *x); } // extern "C" #endif -#endif // VP8_COMMON_BLOCKD_H_ +#endif // VPX_VP8_COMMON_BLOCKD_H_ diff --git a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h index 9b01bba312..b342096b55 100644 --- a/media/libvpx/libvpx/vp8/common/coefupdateprobs.h +++ b/media/libvpx/libvpx/vp8/common/coefupdateprobs.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ -#define VP8_COMMON_COEFUPDATEPROBS_H_ +#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_ +#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_ #ifdef __cplusplus extern "C" { @@ -194,4 +194,4 @@ const vp8_prob vp8_coef_update_probs } // extern "C" #endif -#endif // VP8_COMMON_COEFUPDATEPROBS_H_ +#endif // VPX_VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/media/libvpx/libvpx/vp8/common/common.h b/media/libvpx/libvpx/vp8/common/common.h index bbfc4f3934..562569f9ab 100644 --- a/media/libvpx/libvpx/vp8/common/common.h +++ b/media/libvpx/libvpx/vp8/common/common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COMMON_H_ -#define VP8_COMMON_COMMON_H_ +#ifndef VPX_VP8_COMMON_COMMON_H_ +#define VPX_VP8_COMMON_COMMON_H_ #include @@ -24,25 +24,25 @@ extern "C" { /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy(Dest, Src) \ - { \ + do { \ assert(sizeof(Dest) == sizeof(Src)); \ memcpy(Dest, Src, sizeof(Src)); \ - } + } while (0) /* Use this for variably-sized arrays. */ -#define vp8_copy_array(Dest, Src, N) \ - { \ - assert(sizeof(*Dest) == sizeof(*Src)); \ - memcpy(Dest, Src, N * sizeof(*Src)); \ - } +#define vp8_copy_array(Dest, Src, N) \ + do { \ + assert(sizeof(*(Dest)) == sizeof(*(Src))); \ + memcpy(Dest, Src, (N) * sizeof(*(Src))); \ + } while (0) -#define vp8_zero(Dest) memset(&Dest, 0, sizeof(Dest)); +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)) -#define vp8_zero_array(Dest, N) memset(Dest, 0, N * sizeof(*Dest)); +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))) #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_COMMON_H_ +#endif // VPX_VP8_COMMON_COMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/libvpx/vp8/common/default_coef_probs.h index 8c861ac876..b25e4a45a3 100644 --- a/media/libvpx/libvpx/vp8/common/default_coef_probs.h +++ b/media/libvpx/libvpx/vp8/common/default_coef_probs.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. -*/ + */ -#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ #ifdef __cplusplus extern "C" { @@ -157,4 +157,4 @@ static const vp8_prob default_coef_probs } // extern "C" #endif -#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#endif // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/media/libvpx/libvpx/vp8/common/entropy.c b/media/libvpx/libvpx/vp8/common/entropy.c index f61fa9e8e4..b9efc0cc1f 100644 --- a/media/libvpx/libvpx/vp8/common/entropy.c +++ b/media/libvpx/libvpx/vp8/common/entropy.c @@ -28,9 +28,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 -}; +DECLARE_ALIGNED(16, const unsigned char, + vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, + 6, 6, 6, 6, 6, 6, 6, 7 }; DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { @@ -41,9 +41,9 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, }; -DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = { - 1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16 -}; +DECLARE_ALIGNED(16, const short, + vp8_default_inv_zig_zag[16]) = { 1, 2, 6, 7, 3, 5, 8, 13, + 4, 9, 12, 14, 10, 11, 15, 16 }; /* vp8_default_zig_zag_mask generated with: @@ -114,7 +114,7 @@ static const vp8_prob Pcat6[] = { 254, 254, 243, 230, 196, 177, p[0] = p[1] = 0; } - void init_bit_trees() { + void init_bit_trees(void) { init_bit_tree(cat1, 1); init_bit_tree(cat2, 2); init_bit_tree(cat3, 3); @@ -129,9 +129,9 @@ static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; -static const vp8_tree_index cat6[22] = { - 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 -}; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 18, 18, 20, 20, 0, 0 }; const vp8_extra_bit_struct vp8_extra_bits[12] = { { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { 0, 0, 0, 2 }, diff --git a/media/libvpx/libvpx/vp8/common/entropy.h b/media/libvpx/libvpx/vp8/common/entropy.h index d088560011..fbdb7bcfca 100644 --- a/media/libvpx/libvpx/vp8/common/entropy.h +++ b/media/libvpx/libvpx/vp8/common/entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPY_H_ -#define VP8_COMMON_ENTROPY_H_ +#ifndef VPX_VP8_COMMON_ENTROPY_H_ +#define VPX_VP8_COMMON_ENTROPY_H_ #include "treecoder.h" #include "blockd.h" @@ -105,4 +105,4 @@ void vp8_coef_tree_initialize(void); } // extern "C" #endif -#endif // VP8_COMMON_ENTROPY_H_ +#endif // VPX_VP8_COMMON_ENTROPY_H_ diff --git a/media/libvpx/libvpx/vp8/common/entropymode.c b/media/libvpx/libvpx/vp8/common/entropymode.c index 30c2fa86ae..f61e0c2e2b 100644 --- a/media/libvpx/libvpx/vp8/common/entropymode.c +++ b/media/libvpx/libvpx/vp8/common/entropymode.c @@ -34,12 +34,13 @@ int vp8_mv_cont(const int_mv *l, const int_mv *a) { static const vp8_prob sub_mv_ref_prob[VP8_SUBMVREFS - 1] = { 180, 162, 25 }; -const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT] - [VP8_SUBMVREFS - 1] = { { 147, 136, 18 }, - { 106, 145, 1 }, - { 179, 121, 1 }, - { 223, 1, 34 }, - { 208, 1, 1 } }; +const vp8_prob vp8_sub_mv_ref_prob2[SUBMVREF_COUNT][VP8_SUBMVREFS - 1] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; const vp8_mbsplit vp8_mbsplits[VP8_NUMMBSPLITS] = { { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, @@ -74,9 +75,9 @@ const vp8_tree_index vp8_ymode_tree[8] = { -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED }; -const vp8_tree_index vp8_kf_ymode_tree[8] = { - -B_PRED, 2, 4, 6, -DC_PRED, -V_PRED, -H_PRED, -TM_PRED -}; +const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2, 4, + 6, -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED }; const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2, -V_PRED, 4, -H_PRED, -TM_PRED }; @@ -98,6 +99,6 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) { memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); } -void vp8_default_bmode_probs(vp8_prob p[VP8_BINTRAMODES - 1]) { - memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) { + memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob)); } diff --git a/media/libvpx/libvpx/vp8/common/entropymode.h b/media/libvpx/libvpx/vp8/common/entropymode.h index e0a17df10c..c772cece57 100644 --- a/media/libvpx/libvpx/vp8/common/entropymode.h +++ b/media/libvpx/libvpx/vp8/common/entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMODE_H_ -#define VP8_COMMON_ENTROPYMODE_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_ +#define VPX_VP8_COMMON_ENTROPYMODE_H_ #include "onyxc_int.h" #include "treecoder.h" @@ -78,11 +78,11 @@ extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES - 1]; void vp8_init_mbmode_probs(VP8_COMMON *x); void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]); -void vp8_kf_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES] - [VP8_BINTRAMODES - 1]); +void vp8_kf_default_bmode_probs( + vp8_prob dest[VP8_BINTRAMODES][VP8_BINTRAMODES][VP8_BINTRAMODES - 1]); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMODE_H_ +#endif // VPX_VP8_COMMON_ENTROPYMODE_H_ diff --git a/media/libvpx/libvpx/vp8/common/entropymv.h b/media/libvpx/libvpx/vp8/common/entropymv.h index 6373000903..40039f5b2c 100644 --- a/media/libvpx/libvpx/vp8/common/entropymv.h +++ b/media/libvpx/libvpx/vp8/common/entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMV_H_ -#define VP8_COMMON_ENTROPYMV_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMV_H_ +#define VPX_VP8_COMMON_ENTROPYMV_H_ #include "treecoder.h" @@ -46,4 +46,4 @@ extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMV_H_ +#endif // VPX_VP8_COMMON_ENTROPYMV_H_ diff --git a/media/libvpx/libvpx/vp8/common/extend.c b/media/libvpx/libvpx/vp8/common/extend.c index 2d67b516be..b52e9fe93c 100644 --- a/media/libvpx/libvpx/vp8/common/extend.c +++ b/media/libvpx/libvpx/vp8/common/extend.c @@ -11,31 +11,40 @@ #include "extend.h" #include "vpx_mem/vpx_mem.h" -static void copy_and_extend_plane(unsigned char *s, /* source */ - int sp, /* source pitch */ - unsigned char *d, /* destination */ - int dp, /* destination pitch */ - int h, /* height */ - int w, /* width */ - int et, /* extend top border */ - int el, /* extend left border */ - int eb, /* extend bottom border */ - int er /* extend right border */ - ) { - int i; +static void copy_and_extend_plane( + unsigned char *s, /* source */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ + int h, /* height */ + int w, /* width */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er, /* extend right border */ + int interleave_step) { /* step between pixels of the current plane */ + int i, j; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; int linesize; + if (interleave_step < 1) interleave_step = 1; + /* copy the left and right most columns out */ src_ptr1 = s; - src_ptr2 = s + w - 1; + src_ptr2 = s + (w - 1) * interleave_step; dest_ptr1 = d - el; dest_ptr2 = d + w; for (i = 0; i < h; ++i) { memset(dest_ptr1, src_ptr1[0], el); - memcpy(dest_ptr1 + el, src_ptr1, w); + if (interleave_step == 1) { + memcpy(dest_ptr1 + el, src_ptr1, w); + } else { + for (j = 0; j < w; j++) { + dest_ptr1[el + j] = src_ptr1[interleave_step * j]; + } + } memset(dest_ptr2, src_ptr2[0], er); src_ptr1 += sp; src_ptr2 += sp; @@ -70,9 +79,12 @@ void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, int eb = dst->border + dst->y_height - src->y_height; int er = dst->border + dst->y_width - src->y_width; + // detect nv12 colorspace + int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_height, src->y_width, et, el, eb, - er); + er, 1); et = dst->border >> 1; el = dst->border >> 1; @@ -81,11 +93,11 @@ void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, src->uv_height, src->uv_width, et, el, - eb, er); + eb, er, chroma_step); copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, src->uv_height, src->uv_width, et, el, - eb, er); + eb, er, chroma_step); } void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, @@ -99,6 +111,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, int dst_y_offset = srcy * dst->y_stride + srcx; int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + // detect nv12 colorspace + int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; /* If the side is not touching the bounder then don't extend. */ if (srcy) et = 0; @@ -108,7 +122,7 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, dst->y_buffer + dst_y_offset, dst->y_stride, srch, srcw, - et, el, eb, er); + et, el, eb, er, 1); et = (et + 1) >> 1; el = (el + 1) >> 1; @@ -119,11 +133,11 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, dst->u_buffer + dst_uv_offset, dst->uv_stride, srch, - srcw, et, el, eb, er); + srcw, et, el, eb, er, chroma_step); copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, dst->v_buffer + dst_uv_offset, dst->uv_stride, srch, - srcw, et, el, eb, er); + srcw, et, el, eb, er, chroma_step); } /* note the extension is only for the last row, for intra prediction purpose */ diff --git a/media/libvpx/libvpx/vp8/common/extend.h b/media/libvpx/libvpx/vp8/common/extend.h index 7da5ce31da..586a38a4f3 100644 --- a/media/libvpx/libvpx/vp8/common/extend.h +++ b/media/libvpx/libvpx/vp8/common/extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_EXTEND_H_ -#define VP8_COMMON_EXTEND_H_ +#ifndef VPX_VP8_COMMON_EXTEND_H_ +#define VPX_VP8_COMMON_EXTEND_H_ #include "vpx_scale/yv12config.h" @@ -29,4 +29,4 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP8_COMMON_EXTEND_H_ +#endif // VPX_VP8_COMMON_EXTEND_H_ diff --git a/media/libvpx/libvpx/vp8/common/filter.h b/media/libvpx/libvpx/vp8/common/filter.h index f1d5ece4a5..6acee22b21 100644 --- a/media/libvpx/libvpx/vp8/common/filter.h +++ b/media/libvpx/libvpx/vp8/common/filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FILTER_H_ -#define VP8_COMMON_FILTER_H_ +#ifndef VPX_VP8_COMMON_FILTER_H_ +#define VPX_VP8_COMMON_FILTER_H_ #include "vpx_ports/mem.h" @@ -28,4 +28,4 @@ extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); } // extern "C" #endif -#endif // VP8_COMMON_FILTER_H_ +#endif // VPX_VP8_COMMON_FILTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.c b/media/libvpx/libvpx/vp8/common/findnearmv.c index f40d2c6bde..3b31923621 100644 --- a/media/libvpx/libvpx/vp8/common/findnearmv.c +++ b/media/libvpx/libvpx/vp8/common/findnearmv.c @@ -21,19 +21,20 @@ const unsigned char vp8_mbsplit_offset[4][16] = { Note that we only consider one 4x4 subblock from each candidate 16x16 macroblock. */ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best_mv, int cnt[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias) { const MODE_INFO *above = here - xd->mode_info_stride; const MODE_INFO *left = here - 1; const MODE_INFO *aboveleft = above - 1; int_mv near_mvs[4]; int_mv *mv = near_mvs; - int *cntx = cnt; + int *cntx = near_mv_ref_cnts; enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; /* Zero accumulators */ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; - cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] = + near_mv_ref_cnts[3] = 0; /* Process above */ if (above->mbmi.ref_frame != INTRA_FRAME) { @@ -63,7 +64,7 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 2; } else { - cnt[CNT_INTRA] += 2; + near_mv_ref_cnts[CNT_INTRA] += 2; } } @@ -83,33 +84,34 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 1; } else { - cnt[CNT_INTRA] += 1; + near_mv_ref_cnts[CNT_INTRA] += 1; } } /* If we have three distinct MV's ... */ - if (cnt[CNT_SPLITMV]) { + if (near_mv_ref_cnts[CNT_SPLITMV]) { /* See if above-left MV can be merged with NEAREST */ - if (mv->as_int == near_mvs[CNT_NEAREST].as_int) cnt[CNT_NEAREST] += 1; + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + near_mv_ref_cnts[CNT_NEAREST] += 1; } - cnt[CNT_SPLITMV] = + near_mv_ref_cnts[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 + (aboveleft->mbmi.mode == SPLITMV); /* Swap near and nearest if necessary */ - if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) { + if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) { int tmp; - tmp = cnt[CNT_NEAREST]; - cnt[CNT_NEAREST] = cnt[CNT_NEAR]; - cnt[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = near_mv_ref_cnts[CNT_NEAREST]; + near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; + near_mv_ref_cnts[CNT_NEAR] = tmp; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } /* Use near_mvs[0] to store the "best" MV */ - if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) { + if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) { near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; } diff --git a/media/libvpx/libvpx/vp8/common/findnearmv.h b/media/libvpx/libvpx/vp8/common/findnearmv.h index c1eaa26980..d7db9544aa 100644 --- a/media/libvpx/libvpx/vp8/common/findnearmv.h +++ b/media/libvpx/libvpx/vp8/common/findnearmv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FINDNEARMV_H_ -#define VP8_COMMON_FINDNEARMV_H_ +#ifndef VPX_VP8_COMMON_FINDNEARMV_H_ +#define VPX_VP8_COMMON_FINDNEARMV_H_ #include "./vpx_config.h" #include "mv.h" @@ -70,7 +70,7 @@ static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, } void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best, int near_mv_ref_cts[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias); int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here, @@ -148,4 +148,4 @@ static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, } // extern "C" #endif -#endif // VP8_COMMON_FINDNEARMV_H_ +#endif // VPX_VP8_COMMON_FINDNEARMV_H_ diff --git a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c index 28c981a607..58c778aba5 100644 --- a/media/libvpx/libvpx/vp8/common/generic/systemdependent.c +++ b/media/libvpx/libvpx/vp8/common/generic/systemdependent.c @@ -10,32 +10,34 @@ #include "vpx_config.h" #include "vp8_rtcd.h" -#if ARCH_ARM +#if VPX_ARCH_ARM #include "vpx_ports/arm.h" -#elif ARCH_X86 || ARCH_X86_64 +#elif VPX_ARCH_X86 || VPX_ARCH_X86_64 #include "vpx_ports/x86.h" +#elif VPX_ARCH_PPC +#include "vpx_ports/ppc.h" +#elif VPX_ARCH_MIPS +#include "vpx_ports/mips.h" +#elif VPX_ARCH_LOONGARCH +#include "vpx_ports/loongarch.h" #endif #include "vp8/common/onyxc_int.h" #include "vp8/common/systemdependent.h" #if CONFIG_MULTITHREAD -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #include #elif defined(_WIN32) #include typedef void(WINAPI *PGNSI)(LPSYSTEM_INFO); -#elif defined(__OS2__) -#define INCL_DOS -#define INCL_DOSSPINLOCK -#include #endif #endif #if CONFIG_MULTITHREAD -static int get_cpu_count() { +static int get_cpu_count(void) { int core_count = 16; -#if HAVE_UNISTD_H && !defined(__OS2__) +#if HAVE_UNISTD_H #if defined(_SC_NPROCESSORS_ONLN) core_count = (int)sysconf(_SC_NPROCESSORS_ONLN); #elif defined(_SC_NPROC_ONLN) @@ -43,38 +45,13 @@ static int get_cpu_count() { #endif #elif defined(_WIN32) { -#if _WIN32_WINNT >= 0x0501 +#if _WIN32_WINNT < 0x0501 +#error _WIN32_WINNT must target Windows XP or newer. +#endif SYSTEM_INFO sysinfo; GetNativeSystemInfo(&sysinfo); -#else - PGNSI pGNSI; - SYSTEM_INFO sysinfo; - - /* Call GetNativeSystemInfo if supported or - * GetSystemInfo otherwise. */ - - pGNSI = (PGNSI)GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), - "GetNativeSystemInfo"); - if (pGNSI != NULL) - pGNSI(&sysinfo); - else - GetSystemInfo(&sysinfo); -#endif - core_count = (int)sysinfo.dwNumberOfProcessors; } -#elif defined(__OS2__) - { - ULONG proc_id; - ULONG status; - - core_count = 0; - for (proc_id = 1;; ++proc_id) { - if (DosGetProcessorStatus(proc_id, &status)) break; - - if (status == PROC_ONLINE) core_count++; - } - } #else /* other platforms */ #endif @@ -89,10 +66,4 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) { #else (void)ctx; #endif /* CONFIG_MULTITHREAD */ - -#if ARCH_ARM - ctx->cpu_caps = arm_cpu_caps(); -#elif ARCH_X86 || ARCH_X86_64 - ctx->cpu_caps = x86_simd_caps(); -#endif } diff --git a/media/libvpx/libvpx/vp8/common/header.h b/media/libvpx/libvpx/vp8/common/header.h index 1df01fc6fa..e64e241908 100644 --- a/media/libvpx/libvpx/vp8/common/header.h +++ b/media/libvpx/libvpx/vp8/common/header.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_HEADER_H_ -#define VP8_COMMON_HEADER_H_ +#ifndef VPX_VP8_COMMON_HEADER_H_ +#define VPX_VP8_COMMON_HEADER_H_ #ifdef __cplusplus extern "C" { @@ -45,4 +45,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_HEADER_H_ +#endif // VPX_VP8_COMMON_HEADER_H_ diff --git a/media/libvpx/libvpx/vp8/common/idct_blk.c b/media/libvpx/libvpx/vp8/common/idct_blk.c index ff9f3eb7f2..ebe1774f56 100644 --- a/media/libvpx/libvpx/vp8/common/idct_blk.c +++ b/media/libvpx/libvpx/vp8/common/idct_blk.c @@ -12,12 +12,6 @@ #include "vp8_rtcd.h" #include "vpx_mem/vpx_mem.h" -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, - int stride); -void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); - void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i, j; @@ -39,40 +33,40 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, } } -void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstu, stride); + vp8_dequant_idct_add_c(q, dq, dst_u, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstv, stride); + vp8_dequant_idct_add_c(q, dq, dst_v, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/media/libvpx/libvpx/vp8/common/invtrans.h b/media/libvpx/libvpx/vp8/common/invtrans.h index c7af32fb67..aed7bb0600 100644 --- a/media/libvpx/libvpx/vp8/common/invtrans.h +++ b/media/libvpx/libvpx/vp8/common/invtrans.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_INVTRANS_H_ -#define VP8_COMMON_INVTRANS_H_ +#ifndef VPX_VP8_COMMON_INVTRANS_H_ +#define VPX_VP8_COMMON_INVTRANS_H_ #include "./vpx_config.h" #include "vp8_rtcd.h" @@ -54,4 +54,4 @@ static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) { } // extern "C" #endif -#endif // VP8_COMMON_INVTRANS_H_ +#endif // VPX_VP8_COMMON_INVTRANS_H_ diff --git a/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c new file mode 100644 index 0000000000..eee871eec4 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/idct_lsx.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_util/loongson_intrinsics.h" + +static const int32_t cospi8sqrt2minus1 = 20091; +static const int32_t sinpi8sqrt2 = 35468; + +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m); \ + DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m); \ + DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } while (0) + +#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i s4_m, s5_m, s6_m, s7_m; \ + \ + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m); \ + DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2); \ + out1 = __lsx_vilvh_d(s6_m, s4_m); \ + out3 = __lsx_vilvh_d(s7_m, s5_m); \ + } while (0) + +#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1) \ + do { \ + __m128i zero_m = __lsx_vldi(0); \ + __m128i tmp1_m, tmp2_m; \ + __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + \ + tmp1_m = __lsx_vilvl_h(in0, zero_m); \ + tmp2_m = __lsx_vilvh_h(in0, zero_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m); \ + tmp1_m = __lsx_vsrai_w(tmp1_m, 16); \ + tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m); \ + tmp2_m = __lsx_vsrai_w(tmp2_m, 16); \ + in1 = __lsx_vpickev_h(tmp2_m, tmp1_m); \ + } while (0) + +#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m; \ + __m128i d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \ + a1_m = __lsx_vadd_h(in0, in2); \ + b1_m = __lsx_vsub_h(in0, in2); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m); \ + \ + c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1); \ + c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m); \ + \ + d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1); \ + d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m); \ + EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m); \ + d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } while (0) + +#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i a1_m, b1_m, c1_m, d1_m; \ + __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m; \ + \ + const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \ + sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \ + a1_m = __lsx_vadd_w(in0, in2); \ + b1_m = __lsx_vsub_w(in0, in2); \ + c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m); \ + c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16); \ + c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m); \ + c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16); \ + c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m); \ + c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m); \ + d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m); \ + d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16); \ + d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m); \ + d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m); \ + d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16); \ + d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m); \ + LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ + } while (0) + +#define UNPCK_SH_SW(in, out0, out1) \ + do { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } while (0) + +static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred, + int32_t pred_stride, uint8_t *dest, + int32_t dest_stride) { + __m128i vec, res0, res1, res2, res3, dst0, dst1; + __m128i pred0, pred1, pred2, pred3; + __m128i zero = __lsx_vldi(0); + + int32_t pred_stride2 = pred_stride << 1; + int32_t pred_stride3 = pred_stride2 + pred_stride; + + vec = __lsx_vreplgr2vr_h(in_dc); + vec = __lsx_vsrari_h(vec, 3); + pred0 = __lsx_vld(pred, 0); + DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2); + pred3 = __lsx_vldx(pred, pred_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1); + dst0 = __lsx_vpickev_w(dst1, dst0); + __lsx_vstelm_w(dst0, dest, 0, 0); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 1); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 2); + dest += dest_stride; + __lsx_vstelm_w(dst0, dest, 0, 3); +} + +void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr, + int32_t pred_stride, uint8_t *dst_ptr, + int32_t dst_stride) { + idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride); +} + +static void dequant_idct4x4_addblk_2x_lsx(int16_t *input, + int16_t *dequant_input, uint8_t *dest, + int32_t dest_stride) { + __m128i dest0, dest1, dest2, dest3; + __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; + __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; + __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; + __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; + __m128i zero = __lsx_vldi(0); + + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0, + dequant_in1); + + DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, + in3, dequant_in1, mul0, mul1, mul2, mul3); + DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2); + DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3); + + VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + UNPCK_SH_SW(hz0, hz0r, hz0l); + UNPCK_SH_SW(hz1, hz1r, hz1l); + UNPCK_SH_SW(hz2, hz2r, hz2l); + UNPCK_SH_SW(hz3, hz3r, hz3l); + VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l); + DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l, + vt2l, vt3l); + VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r); + DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r, + vt2r, vt3r); + DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, + vt0, vt1, vt2, vt3); + TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, + res1, res2, res3); + + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l); + + __lsx_vstelm_d(vt0l, dest, 0, 0); + __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1); + __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1); + + __lsx_vst(zero, input, 0); + __lsx_vst(zero, input, 16); + __lsx_vst(zero, input, 32); + __lsx_vst(zero, input, 48); +} + +static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input, + uint8_t *dest, int32_t dest_stride) { + __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3; + __m128i dest0, dest1, dest2, dest3; + __m128i zero = __lsx_vldi(0); + int32_t dest_stride2 = dest_stride << 1; + int32_t dest_stride3 = dest_stride2 + dest_stride; + + input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]); + input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]); + DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1); + vec = __lsx_vpickev_d(input_dc1, input_dc0); + input[0] = 0; + input[16] = 0; + dest0 = __lsx_vld(dest, 0); + DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2); + dest3 = __lsx_vldx(dest, dest_stride3); + DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + res0 = __lsx_vclip255_h(res0); + res1 = __lsx_vclip255_h(res1); + res2 = __lsx_vclip255_h(res2); + res3 = __lsx_vclip255_h(res3); + + DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1); + __lsx_vstelm_d(res0, dest, 0, 0); + __lsx_vstelm_d(res0, dest + dest_stride, 0, 1); + __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0); + __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1); +} + +void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst, + int32_t stride, char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + uint8_t i; + + for (i = 4; i--;) { + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst, stride); + } + } + + q += 32; + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride); + } + } + + q += 32; + dst += (4 * stride); + eobs_h += 2; + } +} + +void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, + char *eobs) { + int16_t *eobs_h = (int16_t *)eobs; + if (eobs_h[0]) { + if (eobs_h[0] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + dst_u += (stride * 4); + + if (eobs_h[1]) { + if (eobs_h[1] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride); + } + } + + q += 32; + + if (eobs_h[2]) { + if (eobs_h[2] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } + q += 32; + dst_v += (stride * 4); + + if (eobs_h[3]) { + if (eobs_h[3] & 0xfefe) { + dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride); + } else { + dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride); + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c new file mode 100644 index 0000000000..79c3ea6dbb --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vpx_util/loongson_intrinsics.h" + +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + do { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt = __lsx_vand_v(filt, hev); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + t1 = __lsx_vsadd_b(filt, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(filt, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(filt, hev); \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + } while (0) + +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + do { \ + __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + __m128i u, filt, t1, t2, filt_sign, q0_sub_p0; \ + __m128i filt_r, filt_l; \ + __m128i temp0, temp1, temp2, temp3; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + const __m128i cnst9h = __lsx_vldi(1033); \ + const __m128i cnst63h = __lsx_vldi(1087); \ + \ + p2_m = __lsx_vxori_b(p2, 0x80); \ + p1_m = __lsx_vxori_b(p1, 0x80); \ + p0_m = __lsx_vxori_b(p0, 0x80); \ + q0_m = __lsx_vxori_b(q0, 0x80); \ + q1_m = __lsx_vxori_b(q1, 0x80); \ + q2_m = __lsx_vxori_b(q2, 0x80); \ + \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vand_v(filt, mask); \ + \ + t2 = __lsx_vand_v(filt, hev); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt = __lsx_vand_v(hev, filt); \ + t1 = __lsx_vsadd_b(t2, cnst4b); \ + t1 = __lsx_vsra_b(t1, cnst3b); \ + t2 = __lsx_vsadd_b(t2, cnst3b); \ + t2 = __lsx_vsra_b(t2, cnst3b); \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + filt_sign = __lsx_vslti_b(filt, 0); \ + filt_r = __lsx_vilvl_b(filt_sign, filt); \ + filt_l = __lsx_vilvh_b(filt_sign, filt); \ + temp0 = __lsx_vmul_h(filt_r, cnst9h); \ + temp1 = __lsx_vadd_h(temp0, cnst63h); \ + temp2 = __lsx_vmul_h(filt_l, cnst9h); \ + temp3 = __lsx_vadd_h(temp2, cnst63h); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q2_m = __lsx_vssub_b(q2_m, u); \ + p2_m = __lsx_vsadd_b(p2_m, u); \ + q2 = __lsx_vxori_b(q2_m, 0x80); \ + p2 = __lsx_vxori_b(p2_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q1_m = __lsx_vssub_b(q1_m, u); \ + p1_m = __lsx_vsadd_b(p1_m, u); \ + q1 = __lsx_vxori_b(q1_m, 0x80); \ + p1 = __lsx_vxori_b(p1_m, 0x80); \ + \ + temp1 = __lsx_vadd_h(temp1, temp0); \ + temp3 = __lsx_vadd_h(temp3, temp2); \ + \ + u = __lsx_vssrani_b_h(temp3, temp1, 7); \ + q0_m = __lsx_vssub_b(q0_m, u); \ + p0_m = __lsx_vsadd_b(p0_m, u); \ + q0 = __lsx_vxori_b(q0_m, 0x80); \ + p0 = __lsx_vxori_b(p0_m, 0x80); \ + } while (0) + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + do { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } while (0) + +#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \ + do { \ + __lsx_vstelm_w(in0, pdst, 0, in0_idx); \ + __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx); \ + } while (0) + +static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + + DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src, + -pitch, p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2); + q3 = __lsx_vldx(src, pitch_x3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstx(p1, src, -pitch_x2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + uint8_t *src_tmp0 = src - 4; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + row0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + + row8 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_tmp0, pitch_x3); + src_tmp0 += pitch_x4; + row12 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_tmp0, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp3, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp3, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp4, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp4, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(tmp5, src, 0, 0); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 1); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 2); + src += pitch; + __lsx_vstelm_w(tmp5, src, 0, 3); +} + +static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2, + src_u, -pitch, p3_u, p2_u, p1_u, p0_u); + q0_u = __lsx_vld(src_u, 0); + DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u); + q3_u = __lsx_vldx(src_u, pitch_x3); + + DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2, + src_v, -pitch, p3_v, p2_v, p1_v, p0_v); + q0_v = __lsx_vld(src_v, 0); + DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v); + q3_v = __lsx_vldx(src_v, pitch_x3); + + /* right 8 element of p3 are u pixel and + left 8 element of p3 are v pixel */ + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + __lsx_vstelm_d(q1, src_u + pitch, 0, 0); + __lsx_vstelm_d(q0, src_u, 0, 0); + __lsx_vstelm_d(p0, src_u - pitch, 0, 0); + __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0); + + __lsx_vstelm_d(q1, src_v + pitch, 0, 1); + __lsx_vstelm_d(q0, src_v, 0, 1); + __lsx_vstelm_d(p0, src_v - pitch, 0, 1); + __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1); +} + +static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *src_u_tmp, *src_v_tmp; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + thresh = __lsx_vreplgr2vr_b(thresh_in); + limit = __lsx_vreplgr2vr_b(limit_in); + b_limit = __lsx_vreplgr2vr_b(b_limit_in); + + src_u_tmp = src_u - 4; + row0 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2); + row3 = __lsx_vldx(src_u_tmp, pitch_x3); + src_u_tmp += pitch_x4; + row4 = __lsx_vld(src_u_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6); + row7 = __lsx_vldx(src_u_tmp, pitch_x3); + + src_v_tmp = src_v - 4; + row8 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10); + row11 = __lsx_vldx(src_v_tmp, pitch_x3); + src_v_tmp += pitch_x4; + row12 = __lsx_vld(src_v_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14); + row15 = __lsx_vldx(src_v_tmp, pitch_x3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + + tmp0 = __lsx_vilvh_b(p0, p1); + tmp1 = __lsx_vilvh_b(q1, q0); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src_u_tmp += 2; + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3); + + src_v_tmp += 2; + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2); + __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3); + + __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2); + __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3); +} + +static inline void mbloop_filter_horizontal_edge_y_lsx( + uint8_t *src, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3, p2, p1, p0); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0, q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + temp_src = src - pitch_x3; + __lsx_vstx(p2, temp_src, 0); + __lsx_vstx(p1, temp_src, pitch); + __lsx_vstx(p0, temp_src, pitch_x2); + __lsx_vstx(q0, temp_src, pitch_x3); + temp_src += pitch_x4; + __lsx_vstx(q1, temp_src, 0); + __lsx_vstx(q2, temp_src, pitch); +} + +static inline void mbloop_filter_horizontal_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u; + __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + temp_src = src_u - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u); + temp_src = src_v - pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v); + + DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, + q1, q2, q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + src_u -= pitch_x3; + __lsx_vstelm_d(p2, src_u, 0, 0); + __lsx_vstelm_d(p1, src_u + pitch, 0, 0); + __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0); + __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0); + src_u += pitch_x4; + __lsx_vstelm_d(q1, src_u, 0, 0); + src_u += pitch; + __lsx_vstelm_d(q2, src_u, 0, 0); + + src_v -= pitch_x3; + __lsx_vstelm_d(p2, src_v, 0, 1); + __lsx_vstelm_d(p1, src_v + pitch, 0, 1); + __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1); + __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1); + src_v += pitch_x4; + __lsx_vstelm_d(q1, src_v, 0, 1); + src_v += pitch; + __lsx_vstelm_d(q2, src_v, 0, 1); +} + +static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src, + int32_t pitch, + const uint8_t b_limit_in, + const uint8_t limit_in, + const uint8_t thresh_in) { + uint8_t *temp_src; + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + temp_src = src - 4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row0, row1, row2, row3); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row4, row5, row6, row7); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row8, row9, row10, row11); + temp_src += pitch_x4; + DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2, + temp_src, pitch_x3, row12, row13, row14, row15); + temp_src -= pitch_x4; + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + temp_src = src - 3; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4); + temp_src += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4); +} + +static inline void mbloop_filter_vertical_edge_uv_lsx( + uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in, + const uint8_t limit_in, const uint8_t thresh_in) { + int32_t pitch_x2 = pitch << 1; + int32_t pitch_x3 = pitch_x2 + pitch; + int32_t pitch_x4 = pitch << 2; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i mask, hev, flat, thresh, limit, b_limit; + __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8; + __m128i row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit); + thresh = __lsx_vldrepl_b(&thresh_in, 0); + + src_u -= 4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row0, row1, row2, row3); + src_u += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u, + pitch_x3, row4, row5, row6, row7); + src_v -= 4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row8, row9, row10, row11); + src_v += pitch_x4; + DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v, + pitch_x3, row12, row13, row14, row15); + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1); + tmp3 = __lsx_vilvl_h(tmp1, tmp0); + tmp4 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1); + tmp6 = __lsx_vilvl_h(tmp1, tmp0); + tmp7 = __lsx_vilvh_h(tmp1, tmp0); + tmp2 = __lsx_vilvl_b(q2, q1); + tmp5 = __lsx_vilvh_b(q2, q1); + + src_u += 1 - pitch_x4; + VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4); + src_u += pitch; + VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4); + + src_v += 1 - pitch_x4; + VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4); + src_v += pitch; + VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4); +} + +void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_horizontal_edge_uv_lsx( + src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim, + *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + if (src_u) { + mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v, + *lpf_info_ptr->mblim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_horizontal_edge_uv_lsx( + src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr); + } +} + +void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v, + int32_t pitch_y, int32_t pitch_u_v, + loop_filter_info *lpf_info_ptr) { + loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim, + lpf_info_ptr->lim, lpf_info_ptr->hev_thr, + lpf_info_ptr->blim, lpf_info_ptr->lim, + lpf_info_ptr->hev_thr); + if (src_u) { + loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v, + *lpf_info_ptr->blim, *lpf_info_ptr->lim, + *lpf_info_ptr->hev_thr); + } +} diff --git a/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c new file mode 100644 index 0000000000..9867633415 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c @@ -0,0 +1,1904 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" +#include "vpx_util/loongson_intrinsics.h" + +DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = { + { 0, -6, 123, 12, -1, 0, 0, 0 }, + { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0 }, + { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0 }, + { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0 }, +}; + +static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2, + __m128i coeff0, __m128i coeff1, __m128i coeff2) { + __m128i out0_m; + + out0_m = __lsx_vdp2_h_b(in0, coeff0); + out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1); + out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2); + + return out0_m; +} + +static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i mask2, + __m128i filt_h0, __m128i filt_h1, + __m128i filt_h2) { + __m128i vec0_m, vec1_m, vec2_m; + __m128i hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + vec2_m = __lsx_vshuf_b(src1, src0, mask2); + hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1, + __m128i filt0, __m128i filt1) { + __m128i tmp_m; + + tmp_m = __lsx_vdp2_h_b(vec0, filt0); + tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1); + + return tmp_m; +} + +static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0, + __m128i mask1, __m128i filt_h0, + __m128i filt_h1) { + __m128i vec0_m, vec1_m, hz_out_m; + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m, + vec1_m); + hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1); + hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT); + hz_out_m = __lsx_vsat_h(hz_out_m, 7); + + return hz_out_m; +} + +#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out0, out1); \ + } while (0) + +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \ + vec5_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \ + vec7_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \ + out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \ + out3); \ + } while (0) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \ + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \ + vec3_m); \ + DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \ + out0, out1); \ + } while (0) + +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \ + vec3_m); \ + DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \ + vec3_m, filt0, out0, out1, out2, out3); \ + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \ + vec1_m); \ + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \ + vec3_m); \ + DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \ + out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \ + out3); \ + } while (0) + +static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride_x2 << 1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out2, out3); + + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, + filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src += src_stride_x4; + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2; + __m128i mask0, mask1, mask2, out; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 2; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + src += src_stride_x4 - 8; + + HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + filt0, filt1, filt2, out0, out1, out2, out3); + HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2, + filt0, filt1, filt2, out4, out5, out6, out7); + DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT, + out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2, + out3); + DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT, + out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6, + out7); + DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1, + out2, out3); + DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5, + out6, out7); + out = __lsx_vpickev_b(out1, out0); + out = __lsx_vxori_b(out, 128); + __lsx_vst(out, dst, 0); + out = __lsx_vpickev_b(out3, out2); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride); + out = __lsx_vpickev_b(out5, out4); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x2); + out = __lsx_vpickev_b(out7, out6); + out = __lsx_vxori_b(out, 128); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + filt2 = __lsx_vldrepl_h(filter, 4); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554, + src8776); + DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776); + out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2); + out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + src2110 = src6554; + src4332 = src8776; + src4 = src8; + } +} + +static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10; + __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r; + __m128i src109_r, filt0, filt1, filt2; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3, + src10_r, src32_r, src21_r, src43_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9, + src76_r, src87_r, src98_r, src109_r); + out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src76_r; + src32_r = src98_r; + src21_r = src87_r; + src43_r = src109_r; + src4 = src10; + } +} + +static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + __m128i src65_l, src87_l, filt0, filt1, filt2; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride_x2; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + filt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_r, src32_r, src43_r, src21_r); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1, + src10_l, src32_l, src43_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l); + out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2); + out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2); + out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2); + out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2); + out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2); + out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2); + out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2); + out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } +} + +static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1); + src2 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + src6 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + + src7 = __lsx_vld(src, 0); + src8 = __lsx_vldx(src, src_stride); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8); + hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_hz2; + __m128i mask0, mask1, mask2, vec0, vec1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i tmp0, tmp1, tmp2, tmp3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride_x2); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vldx(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2, + hz_out1, hz_out4, hz_out3, out0, out1, out3, out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out7 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + out6 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2, + VP8_FILTER_SHIFT, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + hz_out4 = hz_out8; + out0 = out2; + out1 = out7; + out3 = out5; + out4 = out6; + } +} + +static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); +} + +static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i out0, out1, out2, out3; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out0, out1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2); + src3 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1; + __m128i tmp0, tmp1; + __m128i filt, out0, out1, out2, out3; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i filt0, filt1, mask0, mask1; + __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1; + + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src2, src4, src6); + src += 8; + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src1, src3, src5, src7); + src += src_stride_x4 - 8; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4, + src5, src6, src7); + HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, + filt1, out0, out1, out2, out3); + HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0, + filt1, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2, + VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6, + VP8_FILTER_SHIFT, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0, + out1, out2, out3); + __lsx_vstx(out0, dst, 0); + __lsx_vstx(out1, dst, dst_stride); + __lsx_vstx(out2, dst, dst_stride_x2); + __lsx_vstx(out3, dst, dst_stride_x3); + dst += dst_stride_x4; + } +} + +static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, filt0, filt1, out0, out1; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + + DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src1 = __lsx_vld(src, 0); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + src2110 = __lsx_vilvl_d(src21_r, src10_r); + src2110 = __lsx_vxori_b(src2110, 128); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src += src_stride_x3; + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r); + src4332 = __lsx_vilvl_d(src43_r, src32_r); + src4332 = __lsx_vxori_b(src4332, 128); + out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1); + + src2 = __lsx_vld(src, 0); + src += src_stride; + DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r); + src2110 = __lsx_vilvl_d(src65_r, src54_r); + src2110 = __lsx_vxori_b(src2110, 128); + out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1); + out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT); + out0 = __lsx_vxori_b(out0, 128); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + } +} + +static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src7, src8, src9, src10; + __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1; + __m128i tmp0, tmp1; + __m128i filt, out0_r, out1_r, out2_r, out3_r; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src7, src8, src9, src10); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9, + src72_r, src87_r, src98_r, src109_r); + out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1); + DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r, + out2_r, VP8_FILTER_SHIFT, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + src10_r = src98_r; + src21_r = src109_r; + src2 = src10; + } +} + +static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l; + __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= src_stride; + filt = __lsx_vld(filter, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_r, src43_r, src54_r, src65_r); + DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5, + src32_l, src43_l, src54_l, src65_l); + out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1); + out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1); + out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1); + out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1); + out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1); + out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1); + out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1); + out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1); + DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l, + out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT, + out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0, + tmp1, tmp2, tmp3); + __lsx_vstx(tmp0, dst, 0); + __lsx_vstx(tmp1, dst, dst_stride); + __lsx_vstx(tmp2, dst, dst_stride_x2); + __lsx_vstx(tmp3, dst, dst_stride_x3); + dst += dst_stride_x4; + + src10_r = src54_r; + src21_r = src65_r; + src10_l = src54_l; + src21_l = src65_l; + src2 = src6; + } +} + +static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + + DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6); + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static inline void common_hv_4ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1; + __m128i mask0, mask1, out0, out1; + __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3; + __m128i vec0, vec1, vec2, vec3, vec4; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + vec4 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1); + + hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1); + tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + + vec0 = vec4; + vec2 = vec1; + } +} + +static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + src -= 2; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2); + src += src_stride_x2; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + + hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src3 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5); + src6 = __lsx_vldx(src, src_stride_x3); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + vec2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp1, dst, 0, 1); + dst += dst_stride; + + hz_out1 = hz_out5; + vec0 = vec2; + } +} + +static inline void common_hv_6ht_4vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + + __m128i src0, src1, src2, src3, src4, src5, src6; + __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2; + __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3; + __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3; + __m128i out0, out1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= (2 + src_stride); + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + filt_hz2 = __lsx_vreplvei_h(filt, 2); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + + DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1); + src2 = __lsx_vldx(src, src_stride_x2); + src += src_stride_x3; + + DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1); + src2 = __lsx_vxori_b(src2, 128); + hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1, + filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src3, src4, src5, src6); + src += src_stride_x4; + DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3, + src4, src5, src6); + + hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec1 = __lsx_vpackev_b(hz_out3, hz_out2); + tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1); + + hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec3 = __lsx_vpackev_b(hz_out0, hz_out3); + tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1); + + hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1); + + hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0, + filt_hz1, filt_hz2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2); + tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1); + + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + } +} + +static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16); + + src -= 1; + + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, + filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride, + src, src_stride_x2, src0, src1, src3, src4); + src2 = __lsx_vld(src, 0); + src += src_stride_x3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + + DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, + filt_vt1); + filt_vt2 = __lsx_vldrepl_h(filter_vert, 4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7); + src8 = __lsx_vldx(src, src_stride_x3); + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + src += src_stride_x4; + + hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1); + hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff); + out3 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2); + + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + hz_out3 = hz_out7; + out0 = out2; + out1 = out3; + } +} + +static inline void common_hv_4ht_6vt_8w_lsx( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + int32_t src_stride_x2 = src_stride << 1; + int32_t src_stride_x3 = src_stride_x2 + src_stride; + int32_t src_stride_x4 = src_stride << 2; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i filt_hz0, filt_hz1, mask0, mask1; + __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec0, vec1; + + mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0); + src -= 1 + src_stride_x2; + + filt = __lsx_vld(filter_horiz, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1); + mask1 = __lsx_vaddi_bu(mask0, 2); + + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src0, src1, src2, src3); + src += src_stride_x4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + src4 = __lsx_vxori_b(src4, 128); + hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1); + hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1); + hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1); + hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1); + hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1); + DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4); + + filt = __lsx_vld(filter_vert, 0); + DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1); + filt_vt2 = __lsx_vreplvei_h(filt, 2); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src, + src_stride_x3, src5, src6, src7, src8); + src += src_stride_x4; + + DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5, + src6, src7, src8); + hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1); + out2 = __lsx_vpackev_b(hz_out5, hz_out4); + tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2); + + hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1); + out5 = __lsx_vpackev_b(hz_out6, hz_out5); + tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2); + + hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1); + out6 = __lsx_vpackev_b(hz_out7, hz_out6); + tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2); + + hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1); + out7 = __lsx_vpackev_b(hz_out8, hz_out7); + tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1); + DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1); + dst += dst_stride_x4; + hz_out4 = hz_out8; + out0 = out2; + out1 = out6; + out3 = out5; + out4 = out7; + } +} + +static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride, + uint8_t *RESTRICT dst, int32_t dst_stride, + const int8_t *filter_horiz, + const int8_t *filter_vert, + int32_t height) { + common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride, + filter_horiz, filter_vert, height); +} + +typedef void (*PVp8SixtapPredictFunc1)( + uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, + int32_t height); + +typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src, + int32_t src_stride, + uint8_t *RESTRICT dst, + int32_t dst_stride, const int8_t *filter, + int32_t height); + +void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = { + common_hv_6ht_6vt_4w_lsx, + common_hv_6ht_4vt_4w_lsx, + common_hv_4ht_6vt_4w_lsx, + common_hv_4ht_4vt_4w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx, + common_vt_4t_4w_lsx, + common_hz_6t_4w_lsx, + common_hz_4t_4w_lsx }; + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 4); + break; + case 1: + Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 4); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict4x4Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 4); + break; + + case 1: + Predict4x4Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 4); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4); + break; + + case 1: + Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 4); + break; + } + } + } else { + switch (xoffset) { + case 0: { + __m128i tp0; + + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); + dst += dst_stride; + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + + break; + } + case 2: + case 4: + case 6: + Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4); + break; + } + switch (xoffset & 1) { + case 1: + Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 4); + break; + } + } + } +} + +void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = { + common_hv_6ht_6vt_8w_lsx, + common_hv_6ht_4vt_8w_lsx, + common_hv_4ht_6vt_8w_lsx, + common_hv_4ht_4vt_8w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx, + common_vt_4t_8w_lsx, + common_hz_6t_8w_lsx, + common_hz_4t_8w_lsx }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter, + v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter, + v_filter + 1, 8); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict8x8Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 8); + break; + + case 1: + Predict8x8Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 8); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8); + break; + + case 1: + Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1, + 8); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 8); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8); + break; + } + } + } +} + +void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, + int32_t xoffset, int32_t yoffset, + uint8_t *RESTRICT dst, int32_t dst_stride) { + const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1]; + const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1]; + + static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = { + common_hv_6ht_6vt_16w_lsx, + common_hv_6ht_4vt_16w_lsx, + common_hv_4ht_6vt_16w_lsx, + common_hv_4ht_4vt_16w_lsx, + }; + + static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = { + common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx, + common_hz_4t_16w_lsx + }; + + if (yoffset < 8 && xoffset < 8) { + if (yoffset) { + if (xoffset) { + switch (xoffset & 1) { + case 0: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[0](src, src_stride, dst, dst_stride, + h_filter, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[1](src, src_stride, dst, dst_stride, + h_filter, v_filter + 1, 16); + break; + } + break; + + case 1: + switch (yoffset & 1) { + case 0: + Predict16x16Funcs1[2](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter, 16); + break; + + case 1: + Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, + h_filter + 1, v_filter + 1, 16); + break; + } + break; + } + } else { + switch (yoffset & 1) { + case 0: + Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter, + 16); + break; + + case 1: + Predict16x16Funcs2[1](src, src_stride, dst, dst_stride, + v_filter + 1, 16); + break; + } + } + } else { + switch (xoffset & 1) { + case 1: + Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1, + 16); + break; + } + switch (xoffset) { + case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break; + case 2: + case 4: + case 6: + Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16); + break; + } + } + } +} diff --git a/media/libvpx/libvpx/vp8/common/loopfilter.h b/media/libvpx/libvpx/vp8/common/loopfilter.h index 7484563e06..909e8df512 100644 --- a/media/libvpx/libvpx/vp8/common/loopfilter.h +++ b/media/libvpx/libvpx/vp8/common/loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_LOOPFILTER_H_ -#define VP8_COMMON_LOOPFILTER_H_ +#ifndef VPX_VP8_COMMON_LOOPFILTER_H_ +#define VPX_VP8_COMMON_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "vpx_config.h" @@ -26,7 +26,7 @@ extern "C" { typedef enum { NORMAL_LOOPFILTER = 0, SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; -#if ARCH_ARM +#if VPX_ARCH_ARM #define SIMD_WIDTH 1 #else #define SIMD_WIDTH 16 @@ -93,11 +93,9 @@ void vp8_loop_filter_row_normal(struct VP8Common *cm, void vp8_loop_filter_row_simple(struct VP8Common *cm, struct modeinfo *mode_info_context, int mb_row, - int post_ystride, int post_uvstride, - unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr); + int post_ystride, unsigned char *y_ptr); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_LOOPFILTER_H_ +#endif // VPX_VP8_COMMON_LOOPFILTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c index 2a7cde8788..61a55d3c92 100644 --- a/media/libvpx/libvpx/vp8/common/loopfilter_filters.c +++ b/media/libvpx/libvpx/vp8/common/loopfilter_filters.c @@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } -void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ } while (++i < count * 8); } -void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, *op2 = s ^ 0x80; } -void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, } while (++i < count * 8); } -void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -266,28 +270,32 @@ static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, *op0 = u ^ 0x80; } -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, + int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], s[0 * p], - s[1 * p]); - vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2 * y_stride], + y_ptr[-1 * y_stride], y_ptr[0 * y_stride], + y_ptr[1 * y_stride]); + vp8_simple_filter(mask, y_ptr - 2 * y_stride, y_ptr - 1 * y_stride, y_ptr, + y_ptr + 1 * y_stride); + ++y_ptr; } while (++i < 16); } -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0], + y_ptr[1]); + vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1); + y_ptr += y_stride; } while (++i < 16); } @@ -295,17 +303,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -313,17 +321,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -331,21 +339,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } } @@ -363,21 +371,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } } diff --git a/media/libvpx/libvpx/vp8/common/mfqe.c b/media/libvpx/libvpx/vp8/common/mfqe.c index 5aace8c99d..3d801e72e2 100644 --- a/media/libvpx/libvpx/vp8/common/mfqe.c +++ b/media/libvpx/libvpx/vp8/common/mfqe.c @@ -18,6 +18,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "vp8/common/common.h" #include "vp8/common/postproc.h" #include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" @@ -74,8 +75,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride, src_weight); vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); - } else /* if (block_size == 8) */ - { + } else { vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, @@ -136,8 +136,7 @@ static void multiframe_quality_enhance_block( usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; #endif - } else /* if (blksize == 8) */ - { + } else { actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; #ifdef USE_SSD @@ -186,14 +185,12 @@ static void multiframe_quality_enhance_block( apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, uvd_stride, blksize, ifactor); } - } else /* else implicitly copy from previous frame */ - { + } else { /* else implicitly copy from previous frame */ if (blksize == 16) { vp8_copy_mem16x16(y, y_stride, yd, yd_stride); vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); - } else /* if (blksize == 8) */ - { + } else { vp8_copy_mem8x8(y, y_stride, yd, yd_stride); for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) { @@ -215,6 +212,7 @@ static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) { { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 } }; int i, j; + vp8_zero_array(map, 4); for (i = 0; i < 4; ++i) { map[i] = 1; for (j = 0; j < 4 && map[j]; ++j) { @@ -237,7 +235,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { FRAME_TYPE frame_type = cm->frame_type; /* Point at base of Mb MODE_INFO list has motion vectors etc */ - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mb_row; int mb_col; int totmap, map[4]; @@ -297,8 +295,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { } } } - } else /* totmap = 4 */ - { + } else { /* totmap = 4 */ multiframe_quality_enhance_block( 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 2de343419a..b9da52084d 100644 --- a/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr)); @@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), - [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b)); @@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); @@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); __asm__ __volatile__( "ulw %[Temp1], 0(%[src_ptr]) \n\t" @@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr, : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr) - : [src_pixels_per_line] "r"(src_pixels_per_line), - [output_ptr] "r"(output_ptr)); + : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"( + output_ptr)); output_ptr += 48; } diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c index 899dc10ad9..eae852d592 100644 --- a/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c @@ -35,41 +35,41 @@ void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - vp8_dequant_idct_add_dspr2(q, dq, dstu, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride); ((int *)q)[0] = 0; } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - vp8_dequant_idct_add_dspr2(q, dq, dstv, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride); ((int *)q)[0] = 0; } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index d2c3442515..21446fb413 100644 --- a/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/media/libvpx/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); @@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, :); /* if (p1 - p4 == 0) and (p2 - p3 == 0) - * mask will be zero and filtering is not needed - */ + * mask will be zero and filtering is not needed + */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask); @@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 0000000000..86a32aa9ef --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 0000000000..b9330a6663 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 0000000000..4fd6854c52 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, char *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int stride, char *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst_u, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_u += 4; + } + + dst_u += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst_v, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst_v += 4; + } + + dst_v += 4 * stride - 8; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 0000000000..a35689dd30 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint64_t tmp[1]; + double ff_ph_04, ff_ph_4e7b, ff_ph_22a3; + + __asm__ volatile ( + "dli %[tmp0], 0x0004000400040004 \n\t" + "dmtc1 %[tmp0], %[ff_ph_04] \n\t" + "dli %[tmp0], 0x4e7b4e7b4e7b4e7b \n\t" + "dmtc1 %[tmp0], %[ff_ph_4e7b] \n\t" + "dli %[tmp0], 0x22a322a322a322a3 \n\t" + "dmtc1 %[tmp0], %[ff_ph_22a3] \n\t" + MMI_LI(%[tmp0], 0x02) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr), + [ff_ph_4e7b]"=&f"(ff_ph_4e7b), [ff_ph_04]"=&f"(ff_ph_04), + [ff_ph_22a3]"=&f"(ff_ph_22a3) + : [ip]"r"(input), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a0 = ((input_dc + 4) >> 3); + double a1, ftmp[5]; + int low32; + + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[a0], %[a1] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr), [a1]"=&f"(a1) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a0]"r"(a0) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ff_ph_03, ftmp[12]; + uint64_t tmp[1]; + + __asm__ volatile ( + "dli %[tmp0], 0x0003000300030003 \n\t" + "dmtc1 %[tmp0], %[ff_ph_03] \n\t" + MMI_LI(%[tmp0], 0x03) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [ff_ph_03]"=&f"(ff_ph_03) + : [ip]"r"(input), [op]"r"(output) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 0000000000..a07a7e3b41 --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1415 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + double ff_ph_01, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "pxor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_fe]"=&f"(ff_pb_fe), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_03]"=&f"(ff_pb_03) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint64_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + double ff_pb_fe, ff_ph_01, ff_pb_03, ff_pb_04, ff_pb_80; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "por %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "pxor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ssrld %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_ph_01]"=&f"(ff_ph_01), [ff_pb_03]"=&f"(ff_pb_03), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], %[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint64_t tmp[1]; + double ftmp[13]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03, ff_ph_003f, ff_ph_0900, + ff_ph_1200, ff_ph_1b00; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0x1200120012001200 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1200] \n\t" + "dli %[tmp0], 0x1b001b001b001b00 \n\t" + "dmtc1 %[tmp0], %[ff_ph_1b00] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "pand %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "pand %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pxor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_03]"=&f"(ff_pb_03), + [ff_ph_0900]"=&f"(ff_ph_0900), [ff_ph_1b00]"=&f"(ff_ph_1b00), + [ff_ph_1200]"=&f"(ff_ph_1200), [ff_ph_003f]"=&f"(ff_ph_003f) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_MBLOOP_VPSRAB_ADDH \ + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[14]; + double ff_ph_003f, ff_ph_0900, ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_03; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x003f003f003f003f \n\t" + "dmtc1 %[tmp0], %[ff_ph_003f] \n\t" + "dli %[tmp0], 0x0900090009000900 \n\t" + "dmtc1 %[tmp0], %[ff_ph_0900] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0303030303030303 \n\t" + "dmtc1 %[tmp0], %[ff_pb_03] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "pand %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "por %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "pxor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "pand %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "pand %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "pxor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "pxor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "pxor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "pxor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "pxor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "pxor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count), + [ff_ph_003f]"=&f"(ff_ph_003f), [ff_ph_0900]"=&f"(ff_ph_0900), + [ff_pb_03]"=&f"(ff_pb_03), [ff_pb_04]"=&f"(ff_pb_04), + [ff_pb_80]"=&f"(ff_pb_80), [ff_pb_fe]"=&f"(ff_pb_fe) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step) + : "memory" + ); + /* clang-format on */ +} + +/* clang-format off */ +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "por %[ftmp0], %[ftmp0], %[ftmp1] \n\t" +/* clang-format on */ + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + double ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "pand %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "pxor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)) + : "memory" + ); + /* clang-format on */ +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint64_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[2]); + double ftmp[12], ff_pb_fe, ff_pb_80, ff_pb_04, ff_pb_01; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp8] \n\t" + "dli %[tmp0], 0x20 \n\t" + "dmtc1 %[tmp0], %[ftmp10] \n\t" + "dli %[tmp0], 0xfefefefefefefefe \n\t" + "dmtc1 %[tmp0], %[ff_pb_fe] \n\t" + "dli %[tmp0], 0x8080808080808080 \n\t" + "dmtc1 %[tmp0], %[ff_pb_80] \n\t" + "dli %[tmp0], 0x0404040404040404 \n\t" + "dmtc1 %[tmp0], %[ff_pb_04] \n\t" + "dli %[tmp0], 0x0101010101010101 \n\t" + "dmtc1 %[tmp0], %[ff_pb_01] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "dli %[tmp0], 0x01 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "pand %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "pxor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "pxor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "pxor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "pxor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "dli %[tmp0], 0x03 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "dli %[tmp0], 0x0b \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "por %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "ssrld %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "ssrld %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "ssrld %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count), + [ff_pb_fe]"=&f"(ff_pb_fe), [ff_pb_80]"=&f"(ff_pb_80), + [ff_pb_04]"=&f"(ff_pb_04), [ff_pb_01]"=&f"(ff_pb_01) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 0000000000..b85f73fdff --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint64_t tmp[1]; + double ff_ph_40; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp7] \n\t" + "dli %[tmp0], 0x08 \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "ssrld %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr), [ff_ph_40]"=&f"(ff_ph_40) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + double ff_ph_40; + uint64_t tmp[1]; + mips_reg addr[1]; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0040004000400040 \n\t" + "dmtc1 %[tmp0], %[ff_ph_40] \n\t" + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "pxor %[fzero], %[fzero], %[fzero] \n\t" + "dli %[tmp0], 0x07 \n\t" + "dmtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. + */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [ff_ph_40]"=&f"(ff_ph_40) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +/* When xoffset == 0, vp8_filter= {0,0,128,0,0,0}, + function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can + be simplified */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format off */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); + /* clang-format on */ +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + /* clang-format on */ + __asm__ volatile ( + "pxor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); + /* clang-format on */ +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c index 3d516d0f81..efad0c29f8 100644 --- a/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/media/libvpx/libvpx/vp8/common/mips/msa/idct_msa.c @@ -134,7 +134,7 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } -void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { +void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) { v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -157,22 +157,22 @@ void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { ADD2(tmp0, 3, tmp1, 3, out0, out1); out0 >>= 3; out1 >>= 3; - mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); - mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); - mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); - mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); - mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); - mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); - mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); - mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); - mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); - mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); - mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); - mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); - mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); - mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); - mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); - mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); + mb_dqcoeff[0] = __msa_copy_s_h(out0, 0); + mb_dqcoeff[16] = __msa_copy_s_h(out0, 4); + mb_dqcoeff[32] = __msa_copy_s_h(out1, 0); + mb_dqcoeff[48] = __msa_copy_s_h(out1, 4); + mb_dqcoeff[64] = __msa_copy_s_h(out0, 1); + mb_dqcoeff[80] = __msa_copy_s_h(out0, 5); + mb_dqcoeff[96] = __msa_copy_s_h(out1, 1); + mb_dqcoeff[112] = __msa_copy_s_h(out1, 5); + mb_dqcoeff[128] = __msa_copy_s_h(out0, 2); + mb_dqcoeff[144] = __msa_copy_s_h(out0, 6); + mb_dqcoeff[160] = __msa_copy_s_h(out1, 2); + mb_dqcoeff[176] = __msa_copy_s_h(out1, 6); + mb_dqcoeff[192] = __msa_copy_s_h(out0, 3); + mb_dqcoeff[208] = __msa_copy_s_h(out0, 7); + mb_dqcoeff[224] = __msa_copy_s_h(out1, 3); + mb_dqcoeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, @@ -359,27 +359,27 @@ void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst, } } -void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int32_t stride, +void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, char *eobs) { int16_t *eobs_h = (int16_t *)eobs; if (eobs_h[0]) { if (eobs_h[0] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } q += 32; - dstu += (stride * 4); + dst_u += (stride * 4); if (eobs_h[1]) { if (eobs_h[1] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } @@ -387,20 +387,20 @@ void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, if (eobs_h[2]) { if (eobs_h[2] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } q += 32; - dstv += (stride * 4); + dst_v += (stride * 4); if (eobs_h[3]) { if (eobs_h[3] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } } diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c index b0affcff01..3a1bb7cd57 100644 --- a/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c +++ b/media/libvpx/libvpx/vp8/common/mips/msa/sixtap_filter_msa.c @@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ filt_h2) \ ({ \ - v16i8 vec0_m, vec1_m, vec2_m; \ - v8i16 hz_out_m; \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ \ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ - vec0_m, vec1_m, vec2_m); \ - hz_out_m = \ - DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ \ - hz_out_m; \ + _6tap_out_m; \ }) #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ } -#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, filt0, filt1, filt2, out0, out1, \ - out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ - out0, out1, out2, out3); \ +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ } -#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ - ({ \ - v8i16 tmp0; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ ({ \ - v16i8 vec0_m, vec1_m; \ - v8i16 hz_out_m; \ + v8i16 _4tap_dpadd_tmp0; \ \ - VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ - \ - hz_out_m; \ + _4tap_dpadd_tmp0; \ }) -#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ } -#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ } static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, diff --git a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 6bec3adec3..75b99146d2 100644 --- a/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/media/libvpx/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ -#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ #include @@ -40,158 +40,160 @@ #define ST_SW(...) ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - asm volatile("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ } #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -1757,4 +1759,4 @@ \ tmp1_m; \ }) -#endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */ +#endif // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vp8/common/modecont.c b/media/libvpx/libvpx/vp8/common/modecont.c index d6ad9bb99a..bab410374f 100644 --- a/media/libvpx/libvpx/vp8/common/modecont.c +++ b/media/libvpx/libvpx/vp8/common/modecont.c @@ -11,28 +11,16 @@ #include "entropy.h" const int vp8_mode_contexts[6][4] = { - { - /* 0 */ - 7, 1, 1, 143, - }, - { - /* 1 */ - 14, 18, 14, 107, - }, - { - /* 2 */ - 135, 64, 57, 68, - }, - { - /* 3 */ - 60, 56, 128, 65, - }, - { - /* 4 */ - 159, 134, 128, 34, - }, - { - /* 5 */ - 234, 188, 128, 28, - }, + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, }; diff --git a/media/libvpx/libvpx/vp8/common/modecont.h b/media/libvpx/libvpx/vp8/common/modecont.h index b58c7dc2d3..031f74f2ff 100644 --- a/media/libvpx/libvpx/vp8/common/modecont.h +++ b/media/libvpx/libvpx/vp8/common/modecont.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MODECONT_H_ -#define VP8_COMMON_MODECONT_H_ +#ifndef VPX_VP8_COMMON_MODECONT_H_ +#define VPX_VP8_COMMON_MODECONT_H_ #ifdef __cplusplus extern "C" { @@ -21,4 +21,4 @@ extern const int vp8_mode_contexts[6][4]; } // extern "C" #endif -#endif // VP8_COMMON_MODECONT_H_ +#endif // VPX_VP8_COMMON_MODECONT_H_ diff --git a/media/libvpx/libvpx/vp8/common/mv.h b/media/libvpx/libvpx/vp8/common/mv.h index b6d2147af8..4cde12f201 100644 --- a/media/libvpx/libvpx/vp8/common/mv.h +++ b/media/libvpx/libvpx/vp8/common/mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MV_H_ -#define VP8_COMMON_MV_H_ +#ifndef VPX_VP8_COMMON_MV_H_ +#define VPX_VP8_COMMON_MV_H_ #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -30,4 +30,4 @@ typedef union int_mv { } // extern "C" #endif -#endif // VP8_COMMON_MV_H_ +#endif // VPX_VP8_COMMON_MV_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h index 72fba2ec56..2038c000b0 100644 --- a/media/libvpx/libvpx/vp8/common/onyx.h +++ b/media/libvpx/libvpx/vp8/common/onyx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYX_H_ -#define VP8_COMMON_ONYX_H_ +#ifndef VPX_VP8_COMMON_ONYX_H_ +#define VPX_VP8_COMMON_ONYX_H_ #ifdef __cplusplus extern "C" { @@ -26,13 +26,6 @@ struct VP8_COMP; /* Create/destroy static data structures. */ -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, @@ -58,19 +51,19 @@ typedef enum { #include static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -90,7 +83,14 @@ typedef struct { int Width; int Height; struct vpx_rational timebase; - unsigned int target_bandwidth; /* kilobits per second */ + /* In either kilobits per second or bits per second, depending on which + * copy of oxcf this is in. + * - ctx->oxcf.target_bandwidth is in kilobits per second. See + * set_vp8e_config(). + * - ctx->cpi->oxcf.target_bandwidth in is bits per second. See + * vp8_change_config(). + */ + unsigned int target_bandwidth; /* Parameter used for applying denoiser. * For temporal denoiser: noise_sensitivity = 0 means off, @@ -221,6 +221,7 @@ typedef struct { /* Temporal scaling parameters */ unsigned int number_of_layers; + /* kilobits per second */ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; unsigned int periodicity; @@ -241,44 +242,44 @@ typedef struct { #endif } VP8_CONFIG; -void vp8_initialize(); +void vp8_initialize(void); -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); -int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags, +int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); -int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags, + int64_t end_time); +int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags, size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); -int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest, +int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); -int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_get_reference(struct VP8_COMP *comp, +int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_get_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_set_reference(struct VP8_COMP *comp, +int vp8_set_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_update_entropy(struct VP8_COMP *comp, int update); -int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows, +int vp8_update_entropy(struct VP8_COMP *cpi, int update); +int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); -int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map, +int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); -int vp8_get_quantizer(struct VP8_COMP *c); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); +int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYX_H_ +#endif // VPX_VP8_COMMON_ONYX_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyxc_int.h b/media/libvpx/libvpx/vp8/common/onyxc_int.h index 9a12c7fb67..d4824d24e4 100644 --- a/media/libvpx/libvpx/vp8/common/onyxc_int.h +++ b/media/libvpx/libvpx/vp8/common/onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYXC_INT_H_ -#define VP8_COMMON_ONYXC_INT_H_ +#ifndef VPX_VP8_COMMON_ONYXC_INT_H_ +#define VPX_VP8_COMMON_ONYXC_INT_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -167,11 +167,10 @@ typedef struct VP8Common { #if CONFIG_POSTPROC struct postproc_state postproc_state; #endif - int cpu_caps; } VP8_COMMON; #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_ONYXC_INT_H_ +#endif // VPX_VP8_COMMON_ONYXC_INT_H_ diff --git a/media/libvpx/libvpx/vp8/common/onyxd.h b/media/libvpx/libvpx/vp8/common/onyxd.h index e05461aad0..217a598de7 100644 --- a/media/libvpx/libvpx/vp8/common/onyxd.h +++ b/media/libvpx/libvpx/vp8/common/onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYXD_H_ -#define VP8_COMMON_ONYXD_H_ +#ifndef VPX_VP8_COMMON_ONYXD_H_ +#define VPX_VP8_COMMON_ONYXD_H_ /* Create/destroy static data structures. */ #ifdef __cplusplus @@ -22,6 +22,7 @@ extern "C" { #include "vpx/vp8.h" struct VP8D_COMP; +struct VP8Common; typedef struct { int Width; @@ -40,21 +41,21 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); -int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, - const uint8_t *dest, int64_t time_stamp); -int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi); +int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); -vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); +int vp8dx_get_quantizer(const struct VP8D_COMP *pbi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYXD_H_ +#endif // VPX_VP8_COMMON_ONYXD_H_ diff --git a/media/libvpx/libvpx/vp8/common/postproc.c b/media/libvpx/libvpx/vp8/common/postproc.c index 8c292d6161..c03b16b2f5 100644 --- a/media/libvpx/libvpx/vp8/common/postproc.c +++ b/media/libvpx/libvpx/vp8/common/postproc.c @@ -60,8 +60,7 @@ static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) { } void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, - int flag) { + YV12_BUFFER_CONFIG *post, int q) { double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); @@ -72,8 +71,6 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, * is a skipped block. */ unsigned char *ylimits = cm->pp_limits_buffer; unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols; - (void)low_var_thresh; - (void)flag; if (ppl > 0) { for (mbr = 0; mbr < cm->mb_rows; ++mbr) { @@ -116,8 +113,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, } } -void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag, +void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, int q, int uvfilter) { int mbr; double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; @@ -125,9 +121,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, int mb_rows = cm->mb_rows; int mb_cols = cm->mb_cols; unsigned char *limits = cm->pp_limits_buffer; - (void)post; - (void)low_var_thresh; - (void)flag; memset(limits, (unsigned char)ppl, 16 * mb_cols); @@ -151,124 +144,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, } #endif // CONFIG_POSTPROC -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. - */ -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; ++i) { - for (j = 0; j < 12; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; ++i) { - for (j = 0; j < 6; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; ++i) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; ++i) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 2; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - #if CONFIG_POSTPROC int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) { @@ -325,7 +200,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vpx_clear_system_state(); if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && - oci->current_video_frame >= 2 && + oci->current_video_frame > 10 && oci->postproc_state.last_base_qindex < 60 && oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) { vp8_multiframe_quality_enhance(oci); @@ -334,11 +209,10 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int); if (flags & VP8D_DEMACROBLOCK) { vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0); + q + (deblock_level - 5) * 10); vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); } else if (flags & VP8D_DEBLOCK) { - vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q, - 1, 0); + vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer, q); } } /* Move partially towards the base q of the previous frame */ @@ -346,12 +220,12 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, (3 * oci->postproc_state.last_base_qindex + oci->base_qindex) >> 2; } else if (flags & VP8D_DEMACROBLOCK) { vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0); + q + (deblock_level - 5) * 10); vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10); oci->postproc_state.last_base_qindex = oci->base_qindex; } else if (flags & VP8D_DEBLOCK) { - vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0); + vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer, q); oci->postproc_state.last_base_qindex = oci->base_qindex; } else { vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer); diff --git a/media/libvpx/libvpx/vp8/common/postproc.h b/media/libvpx/libvpx/vp8/common/postproc.h index 7be112b163..492c52aef6 100644 --- a/media/libvpx/libvpx/vp8/common/postproc.h +++ b/media/libvpx/libvpx/vp8/common/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_POSTPROC_H_ -#define VP8_COMMON_POSTPROC_H_ +#ifndef VPX_VP8_COMMON_POSTPROC_H_ +#define VPX_VP8_COMMON_POSTPROC_H_ #include "vpx_ports/mem.h" struct postproc_state { @@ -27,14 +27,13 @@ struct postproc_state { extern "C" { #endif int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, - vp8_ppflags_t *flags); + vp8_ppflags_t *ppflags); -void vp8_de_noise(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag, +void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, int q, int uvfilter); -void vp8_deblock(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag); +void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, int q); #define MFQE_PRECISION 4 @@ -43,4 +42,4 @@ void vp8_multiframe_quality_enhance(struct VP8Common *cm); } // extern "C" #endif -#endif // VP8_COMMON_POSTPROC_H_ +#endif // VPX_VP8_COMMON_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vp8/common/ppflags.h b/media/libvpx/libvpx/vp8/common/ppflags.h index 96e3af6c9c..bdf08734b9 100644 --- a/media/libvpx/libvpx/vp8/common/ppflags.h +++ b/media/libvpx/libvpx/vp8/common/ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_PPFLAGS_H_ -#define VP8_COMMON_PPFLAGS_H_ +#ifndef VPX_VP8_COMMON_PPFLAGS_H_ +#define VPX_VP8_COMMON_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -36,4 +36,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_PPFLAGS_H_ +#endif // VPX_VP8_COMMON_PPFLAGS_H_ diff --git a/media/libvpx/libvpx/vp8/common/quant_common.h b/media/libvpx/libvpx/vp8/common/quant_common.h index ff4203df87..049840a272 100644 --- a/media/libvpx/libvpx/vp8/common/quant_common.h +++ b/media/libvpx/libvpx/vp8/common/quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_QUANT_COMMON_H_ -#define VP8_COMMON_QUANT_COMMON_H_ +#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_ +#define VPX_VP8_COMMON_QUANT_COMMON_H_ #include "string.h" #include "blockd.h" @@ -30,4 +30,4 @@ extern int vp8_ac_uv_quant(int QIndex, int Delta); } // extern "C" #endif -#endif // VP8_COMMON_QUANT_COMMON_H_ +#endif // VPX_VP8_COMMON_QUANT_COMMON_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconinter.c b/media/libvpx/libvpx/vp8/common/reconinter.c index 48892c9b8e..2cb0709318 100644 --- a/media/libvpx/libvpx/vp8/common/reconinter.c +++ b/media/libvpx/libvpx/vp8/common/reconinter.c @@ -333,6 +333,13 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, _16x16mv.as_mv.row &= x->fullpixel_mask; _16x16mv.as_mv.col &= x->fullpixel_mask; + if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) || + 2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) { + return; + } + pre_stride >>= 1; offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); uptr = x->pre.u_buffer + offset; diff --git a/media/libvpx/libvpx/vp8/common/reconinter.h b/media/libvpx/libvpx/vp8/common/reconinter.h index 4cdd4fee0f..974e7ce754 100644 --- a/media/libvpx/libvpx/vp8/common/reconinter.h +++ b/media/libvpx/libvpx/vp8/common/reconinter.h @@ -8,30 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTER_H_ -#define VP8_COMMON_RECONINTER_H_ +#ifndef VPX_VP8_COMMON_RECONINTER_H_ +#define VPX_VP8_COMMON_RECONINTER_H_ #ifdef __cplusplus extern "C" { #endif -extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); -extern void vp8_build_inter16x16_predictors_mb( - MACROBLOCKD *x, unsigned char *dst_y, unsigned char *dst_u, - unsigned char *dst_v, int dst_ystride, int dst_uvstride); +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, int dst_ystride, + int dst_uvstride); -extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, - unsigned char *dst_y, - int dst_ystride); -extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, - unsigned char *base_pre, - int pre_stride, vp8_subpix_fn_t sppf); +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y, + int dst_ystride); +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf); -extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_RECONINTER_H_ +#endif // VPX_VP8_COMMON_RECONINTER_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconintra.c b/media/libvpx/libvpx/vp8/common/reconintra.c index 986074ec7e..8e2094da87 100644 --- a/media/libvpx/libvpx/vp8/common/reconintra.c +++ b/media/libvpx/libvpx/vp8/common/reconintra.c @@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s( unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else unsigned char uleft_col[8]; unsigned char vleft_col[8]; +#endif int i; intra_pred_fn fn; diff --git a/media/libvpx/libvpx/vp8/common/reconintra.h b/media/libvpx/libvpx/vp8/common/reconintra.h index fd7c725f35..029ac00a24 100644 --- a/media/libvpx/libvpx/vp8/common/reconintra.h +++ b/media/libvpx/libvpx/vp8/common/reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTRA_H_ -#define VP8_COMMON_RECONINTRA_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA_H_ +#define VPX_VP8_COMMON_RECONINTRA_H_ #include "vp8/common/blockd.h" @@ -32,4 +32,4 @@ void vp8_init_intra_predictors(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA_H_ +#endif // VPX_VP8_COMMON_RECONINTRA_H_ diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.c b/media/libvpx/libvpx/vp8/common/reconintra4x4.c index 07c9223331..be936df5e0 100644 --- a/media/libvpx/libvpx/vp8/common/reconintra4x4.c +++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.c @@ -16,7 +16,7 @@ #include "blockd.h" #include "reconintra4x4.h" #include "vp8/common/common.h" -#include "vpx_ports/mem.h" +#include "vpx_ports/compiler_attributes.h" typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -31,7 +31,7 @@ void vp8_init_intra4x4_predictors_internal(void) { pred[B_LD_PRED] = vpx_d45e_predictor_4x4; pred[B_RD_PRED] = vpx_d135_predictor_4x4; pred[B_VR_PRED] = vpx_d117_predictor_4x4; - pred[B_VL_PRED] = vpx_d63f_predictor_4x4; + pred[B_VL_PRED] = vpx_d63e_predictor_4x4; pred[B_HD_PRED] = vpx_d153_predictor_4x4; pred[B_HU_PRED] = vpx_d207_predictor_4x4; } @@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Aboveb[12], *Above = Aboveb + 4; +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similary for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; #if HAVE_NEON // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it // over reads but does not use the extra 4 values. @@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, // indeed read, they are not used. vp8_zero_array(Left, 8); #endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; #else unsigned char Left[4]; #endif // HAVE_NEON diff --git a/media/libvpx/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/libvpx/vp8/common/reconintra4x4.h index e17fc58c01..3618ec5cbe 100644 --- a/media/libvpx/libvpx/vp8/common/reconintra4x4.h +++ b/media/libvpx/libvpx/vp8/common/reconintra4x4.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTRA4X4_H_ -#define VP8_COMMON_RECONINTRA4X4_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_ +#define VPX_VP8_COMMON_RECONINTRA4X4_H_ #include "vp8/common/blockd.h" #ifdef __cplusplus @@ -31,7 +31,7 @@ static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd, *dst_ptr2 = *src_ptr; } -void vp8_intra4x4_predict(unsigned char *Above, unsigned char *yleft, +void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); @@ -42,4 +42,4 @@ void vp8_init_intra4x4_predictors_internal(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA4X4_H_ +#endif // VPX_VP8_COMMON_RECONINTRA4X4_H_ diff --git a/media/libvpx/libvpx/vp8/common/rtcd.c b/media/libvpx/libvpx/vp8/common/rtcd.c index 09a0e2b4b3..102b7ccd54 100644 --- a/media/libvpx/libvpx/vp8/common/rtcd.c +++ b/media/libvpx/libvpx/vp8/common/rtcd.c @@ -12,4 +12,4 @@ #include "./vp8_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp8_rtcd() { once(setup_rtcd_internal); } +void vp8_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl index bc5e057999..12b474d939 100644 --- a/media/libvpx/libvpx/vp8/common/rtcd_defs.pl +++ b/media/libvpx/libvpx/vp8/common/rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp8_common_forward_decls() { print < #include -#if defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define THREAD_FUNCTION \ - __attribute__((force_align_arg_pointer)) unsigned int __stdcall -#else -#define THREAD_FUNCTION unsigned int __stdcall -#endif -#define THREAD_FUNCTION_RETURN DWORD -#define THREAD_SPECIFIC_INDEX DWORD -#define pthread_t HANDLE -#define pthread_attr_t DWORD -#define pthread_detach(thread) \ - if (thread != NULL) CloseHandle(thread) -#define thread_sleep(nms) Sleep(nms) -#define pthread_cancel(thread) terminate_thread(thread, 0) -#define ts_key_create(ts_key, destructor) \ - { ts_key = TlsAlloc(); }; -#define pthread_getspecific(ts_key) TlsGetValue(ts_key) -#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value) -#define pthread_self() GetCurrentThreadId() - -#elif defined(__OS2__) -/* OS/2 */ -#define INCL_DOS -#include - -#include -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX PULONG -#define pthread_t TID -#define pthread_attr_t ULONG -#define pthread_detach(thread) 0 -#define thread_sleep(nms) DosSleep(nms) -#define pthread_cancel(thread) DosKillThread(thread) -#define ts_key_create(ts_key, destructor) \ - DosAllocThreadLocalMemory(1, &(ts_key)); -#define pthread_getspecific(ts_key) ((void *)(*(ts_key))) -#define pthread_setspecific(ts_key, value) (*(ts_key) = (ULONG)(value)) -#define pthread_self() _gettid() #else +/* pthreads */ #ifdef __APPLE__ #include #include #include #include #include - #else #include #endif - -#include -/* pthreads */ -/* Nearly everything is already defined */ -#define THREAD_FUNCTION void * -#define THREAD_FUNCTION_RETURN void * -#define THREAD_SPECIFIC_INDEX pthread_key_t -#define ts_key_create(ts_key, destructor) \ - pthread_key_create(&(ts_key), destructor); #endif /* Synchronization macros: Win32 and Pthreads */ #if defined(_WIN32) && !HAVE_PTHREAD_H -#define sem_t HANDLE -#define pause(voidpara) __asm PAUSE -#define sem_init(sem, sem_attr1, sem_init_value) \ - (int)((*sem = CreateSemaphore(NULL, 0, 32768, NULL)) == NULL) -#define sem_wait(sem) \ +#define vp8_sem_t HANDLE +#define vp8_sem_init(sem, pshared, value) \ + (int)((*sem = CreateSemaphore(NULL, value, 32768, NULL)) == NULL) +#define vp8_sem_wait(sem) \ (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem, INFINITE)) -#define sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) -#define sem_destroy(sem) \ +#define vp8_sem_post(sem) ReleaseSemaphore(*sem, 1, NULL) +#define vp8_sem_destroy(sem) \ if (*sem) ((int)(CloseHandle(*sem)) == TRUE) #define thread_sleep(nms) Sleep(nms) -#elif defined(__OS2__) -typedef struct { - HEV event; - HMTX wait_mutex; - HMTX count_mutex; - int count; -} sem_t; - -static inline int sem_init(sem_t *sem, int pshared, unsigned int value) { - DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0, - value > 0 ? TRUE : FALSE); - DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE); - DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE); - - sem->count = value; - - return 0; -} - -static inline int sem_wait(sem_t *sem) { - DosRequestMutexSem(sem->wait_mutex, -1); - - DosWaitEventSem(sem->event, -1); - - DosRequestMutexSem(sem->count_mutex, -1); - - sem->count--; - if (sem->count == 0) { - ULONG post_count; - - DosResetEventSem(sem->event, &post_count); - } - - DosReleaseMutexSem(sem->count_mutex); - - DosReleaseMutexSem(sem->wait_mutex); - - return 0; -} - -static inline int sem_post(sem_t *sem) { - DosRequestMutexSem(sem->count_mutex, -1); - - if (sem->count < 32768) { - sem->count++; - DosPostEventSem(sem->event); - } - - DosReleaseMutexSem(sem->count_mutex); - - return 0; -} - -static inline int sem_destroy(sem_t *sem) { - DosCloseEventSem(sem->event); - DosCloseMutexSem(sem->wait_mutex); - DosCloseMutexSem(sem->count_mutex); - - return 0; -} - -#define thread_sleep(nms) DosSleep(nms) - #else #ifdef __APPLE__ -#define sem_t semaphore_t -#define sem_init(X, Y, Z) \ - semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z) -#define sem_wait(sem) (semaphore_wait(*sem)) -#define sem_post(sem) semaphore_signal(*sem) -#define sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) -#define thread_sleep(nms) -/* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = - 1000*nms;nanosleep(&ts, NULL);} */ +#define vp8_sem_t semaphore_t +#define vp8_sem_init(sem, pshared, value) \ + semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, value) +#define vp8_sem_wait(sem) semaphore_wait(*sem) +#define vp8_sem_post(sem) semaphore_signal(*sem) +#define vp8_sem_destroy(sem) semaphore_destroy(mach_task_self(), *sem) #else +#include #include #include -#define thread_sleep(nms) sched_yield(); -/* {struct timespec ts;ts.tv_sec=0; - ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ -#endif +#define vp8_sem_t sem_t +#define vp8_sem_init sem_init +static INLINE int vp8_sem_wait(vp8_sem_t *sem) { + int ret; + while ((ret = sem_wait(sem)) == -1 && errno == EINTR) { + } + return ret; +} +#define vp8_sem_post sem_post +#define vp8_sem_destroy sem_destroy +#endif /* __APPLE__ */ /* Not Windows. Assume pthreads */ +/* thread_sleep implementation: yield unless Linux/Unix. */ +#if defined(__unix__) || defined(__APPLE__) +#define thread_sleep(nms) +/* {struct timespec ts;ts.tv_sec=0; + ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */ +#else +#define thread_sleep(nms) sched_yield(); +#endif /* __unix__ || __APPLE__ */ + #endif -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 #include "vpx_ports/x86.h" #else #define x86_pause_hint() #endif -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define USE_MUTEX_LOCK 1 -#endif -#endif +#include "vpx_util/vpx_atomics.h" -#include "vpx_util/vpx_thread.h" - -static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - int ret; - pthread_mutex_lock(mutex); - ret = *p; - pthread_mutex_unlock(mutex); - return ret; -#endif - return *p; -} - -static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col, - const int *last_row_current_mb_col, - const int nsync) { - while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) { +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { x86_pause_hint(); thread_sleep(0); } } -static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - pthread_mutex_lock(mutex); - *p = v; - pthread_mutex_unlock(mutex); - return; -#endif - *p = v; -} - -#undef USE_MUTEX_LOCK #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_THREADING_H_ +#endif // VPX_VP8_COMMON_THREADING_H_ diff --git a/media/libvpx/libvpx/vp8/common/treecoder.c b/media/libvpx/libvpx/vp8/common/treecoder.c index 9feb40a5a7..f1e78f4321 100644 --- a/media/libvpx/libvpx/vp8/common/treecoder.c +++ b/media/libvpx/libvpx/vp8/common/treecoder.c @@ -12,6 +12,7 @@ #include #include "vp8/common/treecoder.h" +#include "vpx/vpx_integer.h" static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v, int L) { @@ -79,7 +80,7 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ vp8_prob probs[/* n-1 */], unsigned int branch_ct[/* n-1 */][2], const unsigned int num_events[/* n */], - unsigned int Pfac, int rd) { + unsigned int Pfactor, int Round) { const int tree_len = n - 1; int t = 0; @@ -89,10 +90,10 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ const unsigned int *const c = branch_ct[t]; const unsigned int tot = c[0] + c[1]; - assert(tot < (1 << 24)); /* no overflow below */ - if (tot) { - const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + const unsigned int p = + (unsigned int)(((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0)) / + tot; probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ } else { probs[t] = vp8_prob_half; diff --git a/media/libvpx/libvpx/vp8/common/treecoder.h b/media/libvpx/libvpx/vp8/common/treecoder.h index d8503cf3f8..d7d8d0ead0 100644 --- a/media/libvpx/libvpx/vp8/common/treecoder.h +++ b/media/libvpx/libvpx/vp8/common/treecoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_TREECODER_H_ -#define VP8_COMMON_TREECODER_H_ +#ifndef VPX_VP8_COMMON_TREECODER_H_ +#define VPX_VP8_COMMON_TREECODER_H_ #ifdef __cplusplus extern "C" { @@ -32,7 +32,7 @@ typedef const bool_coder_spec c_bool_coder_spec; typedef const bool_writer c_bool_writer; typedef const bool_reader c_bool_reader; -#define vp8_complement(x) (255 - x) +#define vp8_complement(x) (255 - (x)) /* We build coding trees compactly in arrays. Each node of the tree is a pair of vp8_tree_indices. @@ -79,4 +79,4 @@ void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */ } // extern "C" #endif -#endif // VP8_COMMON_TREECODER_H_ +#endif // VPX_VP8_COMMON_TREECODER_H_ diff --git a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h index 9a81ebfe62..3fc942e050 100644 --- a/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h +++ b/media/libvpx/libvpx/vp8/common/vp8_entropymodedata.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. -*/ + */ -#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ -#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ #ifdef __cplusplus extern "C" { @@ -169,4 +169,4 @@ const vp8_prob } // extern "C" #endif -#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#endif // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c index c6430be465..4576c18537 100644 --- a/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c +++ b/media/libvpx/libvpx/vp8/common/vp8_loopfilter.c @@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - /* Abs value */ - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; - } else /* Delta Value */ - { + } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; @@ -221,13 +219,11 @@ void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context, } void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context, - int mb_row, int post_ystride, int post_uvstride, - unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr) { + int mb_row, int post_ystride, + unsigned char *y_ptr) { int mb_col; int filter_level; loop_filter_info_n *lfi_n = &cm->lf_info; - (void)post_uvstride; for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && @@ -260,8 +256,6 @@ void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context, } y_ptr += 16; - u_ptr += 8; - v_ptr += 8; mode_info_context++; /* step to next MB */ } @@ -344,8 +338,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { mode_info_context++; /* Skip border mb */ } - } else /* SIMPLE_LOOPFILTER */ - { + } else { /* SIMPLE_LOOPFILTER */ for (mb_row = 0; mb_row < mb_rows; ++mb_row) { for (mb_col = 0; mb_col < mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 0000000000..6739efa5fe --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 0000000000..ef0e4ae4fe --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ +#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. + SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c new file mode 100644 index 0000000000..ff6cbbd68c --- /dev/null +++ b/media/libvpx/libvpx/vp8/common/x86/bilinear_filter_sse2.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "vp8/common/filter.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void horizontal_16x16(uint8_t *src, const int stride, + uint16_t *dst, const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_lo); + _mm_store_si128((__m128i *)(dst + 8), a_hi); + src += stride; + dst += 16; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0); + const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0); + + const __m128i b = _mm_loadu_si128((__m128i *)(src + 1)); + const __m128i b_lo = _mm_unpacklo_epi8(b, zero); + const __m128i b_hi = _mm_unpackhi_epi8(b, zero); + const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1); + const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1); + + const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); + const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + _mm_store_si128((__m128i *)dst, shifted_lo); + _mm_store_si128((__m128i *)(dst + 8), shifted_hi); + src += stride; + dst += 16; + } + } +} + +static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 16; ++h) { + const __m128i row_lo = _mm_load_si128((__m128i *)src); + const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i packed = _mm_packus_epi16(row_lo, row_hi); + _mm_store_si128((__m128i *)dst, packed); + src += 16; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0_lo = _mm_load_si128((__m128i *)src); + __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8)); + src += 16; + for (h = 0; h < 16; ++h) { + const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0); + const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0); + + const __m128i row_1_lo = _mm_load_si128((__m128i *)src); + const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1); + const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1); + + const __m128i sum_lo = + _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered); + const __m128i sum_hi = + _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi); + _mm_store_si128((__m128i *)dst, packed); + row_0_lo = row_1_lo; + row_0_hi = row_1_hi; + src += 16; + dst += stride; + } + } +} + +void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]); + + assert((xoffset | yoffset) != 0); + + horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_16x16(FData, dst_ptr, dst_pitch, yoffset); +} + +static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset, const int height) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadl_epi64((__m128i *)src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_u16); + src += stride; + dst += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + // Filter horizontally. Rather than load the whole array and transpose, load + // 16 values (overreading) and shift to set up the second value. Do an + // "extra" 9th line so the vertical pass has the necessary context. + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i b = _mm_srli_si128(a, 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_store_si128((__m128i *)dst, shifted); + src += stride; + dst += 8; + } + } +} + +static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset, const int height) { + int h; + + if (yoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i row = _mm_load_si128((__m128i *)src); + const __m128i packed = _mm_packus_epi16(row, row); + _mm_storel_epi64((__m128i *)dst, packed); + src += 8; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0 = _mm_load_si128((__m128i *)src); + src += 8; + for (h = 0; h < height; ++h) { + const __m128i row_1 = _mm_load_si128((__m128i *)src); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + const __m128i packed = _mm_packus_epi16(shifted, shifted); + _mm_storel_epi64((__m128i *)dst, packed); + row_0 = row_1; + src += 8; + dst += stride; + } + } +} + +void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8); +} + +void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4); +} + +static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_storel_epi64((__m128i *)dst, a_u16); + src += stride; + dst += 4; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i b = load_unaligned_u32(src + 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_storel_epi64((__m128i *)dst, shifted); + src += stride; + dst += 4; + } + } +} + +static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 4; h += 2) { + const __m128i row = _mm_load_si128((__m128i *)src); + __m128i packed = _mm_packus_epi16(row, row); + store_unaligned_u32(dst, packed); + dst += stride; + packed = _mm_srli_si128(packed, 4); + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + for (h = 0; h < 4; h += 2) { + const __m128i row_0 = _mm_load_si128((__m128i *)src); + const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + __m128i packed = _mm_packus_epi16(shifted, shifted); + storeu_int32(dst, _mm_cvtsi128_si32(packed)); + packed = _mm_srli_si128(packed, 4); + dst += stride; + storeu_int32(dst, _mm_cvtsi128_si32(packed)); + dst += stride; + src += 8; + } + } +} + +void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_4x4(FData, dst_ptr, dst_pitch, yoffset); +} diff --git a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm index 4e551f00aa..0a269e15f7 100644 --- a/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm +++ b/media/libvpx/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -11,9 +11,10 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) -global sym(vp8_dequantize_b_impl_mmx) PRIVATE +globalsym(vp8_dequantize_b_impl_mmx) sym(vp8_dequantize_b_impl_mmx): push rbp mov rbp, rsp @@ -55,7 +56,7 @@ sym(vp8_dequantize_b_impl_mmx): ;short *dq, 1 ;unsigned char *dest, 2 ;int stride) 3 -global sym(vp8_dequant_idct_add_mmx) PRIVATE +globalsym(vp8_dequant_idct_add_mmx) sym(vp8_dequant_idct_add_mmx): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/filter_x86.c b/media/libvpx/libvpx/vp8/common/x86/filter_x86.c deleted file mode 100644 index 2405342f02..0000000000 --- a/media/libvpx/libvpx/vp8/common/x86/filter_x86.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/x86/filter_x86.h" - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = { - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; diff --git a/media/libvpx/libvpx/vp8/common/x86/filter_x86.h b/media/libvpx/libvpx/vp8/common/x86/filter_x86.h deleted file mode 100644 index d282841bee..0000000000 --- a/media/libvpx/libvpx/vp8/common/x86/filter_x86.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP8_COMMON_X86_FILTER_X86_H_ -#define VP8_COMMON_X86_FILTER_X86_H_ - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with - * duplicated values */ - -/* duplicated 4x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); - -/* duplicated 8x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c index 8aefb27997..897ed5b652 100644 --- a/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c +++ b/media/libvpx/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -42,43 +42,43 @@ void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)(eobs))[0] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; - dstu += stride * 4; + dst_u += stride * 4; if (((short *)(eobs))[1]) { if (((short *)(eobs))[1] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; if (((short *)(eobs))[2]) { if (((short *)(eobs))[2] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } q += 32; - dstv += stride * 4; + dst_v += stride * 4; if (((short *)(eobs))[3]) { if (((short *)(eobs))[3] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } } diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm index 96fa2c60d0..6cea86fe03 100644 --- a/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm +++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -31,10 +31,11 @@ ; * ; **************************************************************************/ +SECTION .text ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, ;int pitch, unsigned char *dest,int stride) -global sym(vp8_short_idct4x4llm_mmx) PRIVATE +globalsym(vp8_short_idct4x4llm_mmx) sym(vp8_short_idct4x4llm_mmx): push rbp mov rbp, rsp @@ -224,7 +225,7 @@ sym(vp8_short_idct4x4llm_mmx): ;int pred_stride, ;unsigned char *dst_ptr, ;int stride) -global sym(vp8_dc_only_idct_add_mmx) PRIVATE +globalsym(vp8_dc_only_idct_add_mmx) sym(vp8_dc_only_idct_add_mmx): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm index bf8e2c4021..bb79d2da3b 100644 --- a/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -19,7 +19,9 @@ ; int dst_stride - 3 ; ) -global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE +SECTION .text + +globalsym(vp8_idct_dequant_0_2x_sse2) sym(vp8_idct_dequant_0_2x_sse2): push rbp mov rbp, rsp @@ -101,7 +103,7 @@ sym(vp8_idct_dequant_0_2x_sse2): ; unsigned char *dst - 2 ; int dst_stride - 3 ; ) -global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE +globalsym(vp8_idct_dequant_full_2x_sse2) sym(vp8_idct_dequant_full_2x_sse2): push rbp mov rbp, rsp @@ -358,7 +360,7 @@ sym(vp8_idct_dequant_full_2x_sse2): ; int dst_stride - 3 ; short *dc - 4 ; ) -global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE +globalsym(vp8_idct_dequant_dc_0_2x_sse2) sym(vp8_idct_dequant_dc_0_2x_sse2): push rbp mov rbp, rsp @@ -434,7 +436,7 @@ sym(vp8_idct_dequant_dc_0_2x_sse2): ; int dst_stride - 3 ; short *dc - 4 ; ) -global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE +globalsym(vp8_idct_dequant_dc_full_2x_sse2) sym(vp8_idct_dequant_dc_full_2x_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm index 06e86a80b6..56f37c3e0f 100644 --- a/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -11,8 +11,10 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) -global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE +SECTION .text + +;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff) +globalsym(vp8_short_inv_walsh4x4_sse2) sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm index 6d5aaa19db..8d12f5385d 100644 --- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -125,6 +125,8 @@ pxor %1, [GLOBAL(t80)] %endmacro +SECTION .text + ;void vp8_loop_filter_bh_y_sse2 ;( ; unsigned char *src_ptr, @@ -133,7 +135,7 @@ ; const char *limit, ; const char *thresh ;) -global sym(vp8_loop_filter_bh_y_sse2) PRIVATE +globalsym(vp8_loop_filter_bh_y_sse2) sym(vp8_loop_filter_bh_y_sse2): %if LIBVPX_YASM_WIN64 @@ -275,7 +277,7 @@ LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 ; const char *thresh ;) -global sym(vp8_loop_filter_bv_y_sse2) PRIVATE +globalsym(vp8_loop_filter_bv_y_sse2) sym(vp8_loop_filter_bv_y_sse2): %if LIBVPX_YASM_WIN64 diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm index 1913abc69b..ce5c313138 100644 --- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -276,6 +276,8 @@ %endmacro +SECTION .text + %if ABI_IS_32BIT ;void vp8_loop_filter_horizontal_edge_sse2 @@ -286,7 +288,7 @@ ; const char *limit, ; const char *thresh, ;) -global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE +globalsym(vp8_loop_filter_horizontal_edge_sse2) sym(vp8_loop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp @@ -334,7 +336,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): ; const char *thresh, ; int count ;) -global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE +globalsym(vp8_loop_filter_horizontal_edge_uv_sse2) sym(vp8_loop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp @@ -561,7 +563,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): ; const char *limit, ; const char *thresh, ;) -global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE +globalsym(vp8_mbloop_filter_horizontal_edge_sse2) sym(vp8_mbloop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp @@ -607,7 +609,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): ; const char *thresh, ; unsigned char *v ;) -global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE +globalsym(vp8_mbloop_filter_horizontal_edge_uv_sse2) sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp @@ -928,7 +930,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): ; const char *limit, ; const char *thresh, ;) -global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE +globalsym(vp8_loop_filter_vertical_edge_sse2) sym(vp8_loop_filter_vertical_edge_sse2): push rbp mov rbp, rsp @@ -993,7 +995,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ; const char *thresh, ; unsigned char *v ;) -global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE +globalsym(vp8_loop_filter_vertical_edge_uv_sse2) sym(vp8_loop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp @@ -1142,7 +1144,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ; const char *limit, ; const char *thresh, ;) -global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE +globalsym(vp8_mbloop_filter_vertical_edge_sse2) sym(vp8_mbloop_filter_vertical_edge_sse2): push rbp mov rbp, rsp @@ -1209,7 +1211,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): ; const char *thresh, ; unsigned char *v ;) -global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE +globalsym(vp8_mbloop_filter_vertical_edge_uv_sse2) sym(vp8_mbloop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp @@ -1269,7 +1271,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): ; int src_pixel_step, ; const char *blimit, ;) -global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE +globalsym(vp8_loop_filter_simple_horizontal_edge_sse2) sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp @@ -1374,7 +1376,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): ; int src_pixel_step, ; const char *blimit, ;) -global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE +globalsym(vp8_loop_filter_simple_vertical_edge_sse2) sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. diff --git a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c index a187d51fbe..cfa13a2ddb 100644 --- a/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c +++ b/media/libvpx/libvpx/vp8/common/x86/loopfilter_x86.c @@ -22,7 +22,7 @@ #define prototype_simple_loopfilter(sym) \ void sym(unsigned char *y, int ystride, const unsigned char *blimit) -#if HAVE_SSE2 && ARCH_X86_64 +#if HAVE_SSE2 && VPX_ARCH_X86_64 prototype_loopfilter(vp8_loop_filter_bv_y_sse2); prototype_loopfilter(vp8_loop_filter_bh_y_sse2); #else @@ -68,7 +68,7 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); #else @@ -101,7 +101,7 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); #else diff --git a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm index 8177b79226..3ec2a99ec2 100644 --- a/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_filter_by_weight16x16_sse2 ;( ; unsigned char *src, @@ -19,7 +21,7 @@ ; int dst_stride, ; int src_weight ;) -global sym(vp8_filter_by_weight16x16_sse2) PRIVATE +globalsym(vp8_filter_by_weight16x16_sse2) sym(vp8_filter_by_weight16x16_sse2): push rbp mov rbp, rsp @@ -97,7 +99,7 @@ sym(vp8_filter_by_weight16x16_sse2): ; int dst_stride, ; int src_weight ;) -global sym(vp8_filter_by_weight8x8_sse2) PRIVATE +globalsym(vp8_filter_by_weight8x8_sse2) sym(vp8_filter_by_weight8x8_sse2): push rbp mov rbp, rsp @@ -165,7 +167,7 @@ sym(vp8_filter_by_weight8x8_sse2): ; unsigned int *variance, 4 ; unsigned int *sad, 5 ;) -global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE +globalsym(vp8_variance_and_sad_16x16_sse2) sym(vp8_variance_and_sad_16x16_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm index 43f2dc6c62..01cf066837 100644 --- a/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm +++ b/media/libvpx/libvpx/vp8/common/x86/recon_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void copy_mem8x8_mmx( ; unsigned char *src, @@ -18,7 +19,7 @@ ; unsigned char *dst, ; int dst_stride ; ) -global sym(vp8_copy_mem8x8_mmx) PRIVATE +globalsym(vp8_copy_mem8x8_mmx) sym(vp8_copy_mem8x8_mmx): push rbp mov rbp, rsp @@ -81,7 +82,7 @@ sym(vp8_copy_mem8x8_mmx): ; unsigned char *dst, ; int dst_stride ; ) -global sym(vp8_copy_mem8x4_mmx) PRIVATE +globalsym(vp8_copy_mem8x4_mmx) sym(vp8_copy_mem8x4_mmx): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm index cb89537f76..17baf094ef 100644 --- a/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/recon_sse2.asm @@ -11,13 +11,15 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void copy_mem16x16_sse2( ; unsigned char *src, ; int src_stride, ; unsigned char *dst, ; int dst_stride ; ) -global sym(vp8_copy_mem16x16_sse2) PRIVATE +globalsym(vp8_copy_mem16x16_sse2) sym(vp8_copy_mem16x16_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm index 6ab7f1fdc7..8f0f6fcc89 100644 --- a/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -10,13 +10,12 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - %define BLOCK_HEIGHT_WIDTH 4 %define vp8_filter_weight 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;void vp8_filter_block1d_h6_mmx ;( @@ -28,7 +27,7 @@ extern sym(vp8_bilinear_filters_x86_8) ; unsigned int output_width, ; short * vp8_filter ;) -global sym(vp8_filter_block1d_h6_mmx) PRIVATE +globalsym(vp8_filter_block1d_h6_mmx) sym(vp8_filter_block1d_h6_mmx): push rbp mov rbp, rsp @@ -125,7 +124,7 @@ sym(vp8_filter_block1d_h6_mmx): ; unsigned int output_width, ; short * vp8_filter ;) -global sym(vp8_filter_block1dc_v6_mmx) PRIVATE +globalsym(vp8_filter_block1dc_v6_mmx) sym(vp8_filter_block1dc_v6_mmx): push rbp mov rbp, rsp @@ -204,280 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict8x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x4_mmx) PRIVATE -sym(vp8_bilinear_predict8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - mov rsi, arg(0) ;src_ptr ; - add rax, rcx - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x4: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 -%endif - cmp rdi, rcx ; - jne .next_row_8x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict4x4_mmx) PRIVATE -sym(vp8_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - SECTION_RODATA align 16 rd: diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm index ca00583ca2..94e14aed6c 100644 --- a/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -10,12 +10,12 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The @@ -33,7 +33,7 @@ extern sym(vp8_bilinear_filters_x86_8) ; unsigned int output_width, ; short *vp8_filter ;) -global sym(vp8_filter_block1d8_h6_sse2) PRIVATE +globalsym(vp8_filter_block1d8_h6_sse2) sym(vp8_filter_block1d8_h6_sse2): push rbp mov rbp, rsp @@ -153,7 +153,7 @@ sym(vp8_filter_block1d8_h6_sse2): ; even number. This function handles 8 pixels in horizontal direction, calculating ONE ; rows each iteration to take advantage of the 128 bits operations. ;*************************************************************************************/ -global sym(vp8_filter_block1d16_h6_sse2) PRIVATE +globalsym(vp8_filter_block1d16_h6_sse2) sym(vp8_filter_block1d16_h6_sse2): push rbp mov rbp, rsp @@ -333,7 +333,7 @@ sym(vp8_filter_block1d16_h6_sse2): ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The ; input pixel array has output_height rows. ;*************************************************************************************/ -global sym(vp8_filter_block1d8_v6_sse2) PRIVATE +globalsym(vp8_filter_block1d8_v6_sse2) sym(vp8_filter_block1d8_v6_sse2): push rbp mov rbp, rsp @@ -428,7 +428,7 @@ sym(vp8_filter_block1d8_v6_sse2): ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The ; input pixel array has output_height rows. ;*************************************************************************************/ -global sym(vp8_filter_block1d16_v6_sse2) PRIVATE +globalsym(vp8_filter_block1d16_v6_sse2) sym(vp8_filter_block1d16_v6_sse2): push rbp mov rbp, rsp @@ -538,7 +538,7 @@ sym(vp8_filter_block1d16_v6_sse2): ; const short *vp8_filter ;) ; First-pass filter only when yoffset==0 -global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE +globalsym(vp8_filter_block1d8_h6_only_sse2) sym(vp8_filter_block1d8_h6_only_sse2): push rbp mov rbp, rsp @@ -651,7 +651,7 @@ sym(vp8_filter_block1d8_h6_only_sse2): ; const short *vp8_filter ;) ; First-pass filter only when yoffset==0 -global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE +globalsym(vp8_filter_block1d16_h6_only_sse2) sym(vp8_filter_block1d16_h6_only_sse2): push rbp mov rbp, rsp @@ -816,7 +816,7 @@ sym(vp8_filter_block1d16_h6_only_sse2): ; const short *vp8_filter ;) ; Second-pass filter only when xoffset==0 -global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE +globalsym(vp8_filter_block1d8_v6_only_sse2) sym(vp8_filter_block1d8_v6_only_sse2): push rbp mov rbp, rsp @@ -908,7 +908,7 @@ sym(vp8_filter_block1d8_v6_only_sse2): ; unsigned int output_height, ; unsigned int output_width ;) -global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE +globalsym(vp8_unpack_block1d16_h6_sse2) sym(vp8_unpack_block1d16_h6_sse2): push rbp mov rbp, rsp @@ -957,419 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2): ret -;void vp8_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp8_bilinear_filters_x86_8) -global sym(vp8_bilinear_predict16x16_sse2) PRIVATE -sym(vp8_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_sse2) PRIVATE -sym(vp8_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 rd: diff --git a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm index 1f6cbd1d1e..17247227db 100644 --- a/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ b/media/libvpx/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -15,6 +15,7 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The @@ -34,7 +35,7 @@ ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE +globalsym(vp8_filter_block1d8_h6_ssse3) sym(vp8_filter_block1d8_h6_ssse3): push rbp mov rbp, rsp @@ -177,7 +178,7 @@ vp8_filter_block1d8_h4_ssse3: ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE +globalsym(vp8_filter_block1d16_h6_ssse3) sym(vp8_filter_block1d16_h6_ssse3): push rbp mov rbp, rsp @@ -284,7 +285,7 @@ sym(vp8_filter_block1d16_h6_ssse3): ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE +globalsym(vp8_filter_block1d4_h6_ssse3) sym(vp8_filter_block1d4_h6_ssse3): push rbp mov rbp, rsp @@ -414,7 +415,7 @@ sym(vp8_filter_block1d4_h6_ssse3): ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE +globalsym(vp8_filter_block1d16_v6_ssse3) sym(vp8_filter_block1d16_v6_ssse3): push rbp mov rbp, rsp @@ -602,7 +603,7 @@ sym(vp8_filter_block1d16_v6_ssse3): ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE +globalsym(vp8_filter_block1d8_v6_ssse3) sym(vp8_filter_block1d8_v6_ssse3): push rbp mov rbp, rsp @@ -742,7 +743,7 @@ sym(vp8_filter_block1d8_v6_ssse3): ; unsigned int output_height, ; unsigned int vp8_filter_index ;) -global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE +globalsym(vp8_filter_block1d4_v6_ssse3) sym(vp8_filter_block1d4_v6_ssse3): push rbp mov rbp, rsp @@ -881,7 +882,7 @@ sym(vp8_filter_block1d4_v6_ssse3): ; unsigned char *dst_ptr, ; int dst_pitch ;) -global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE +globalsym(vp8_bilinear_predict16x16_ssse3) sym(vp8_bilinear_predict16x16_ssse3): push rbp mov rbp, rsp @@ -1144,7 +1145,7 @@ sym(vp8_bilinear_predict16x16_ssse3): ; unsigned char *dst_ptr, ; int dst_pitch ;) -global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE +globalsym(vp8_bilinear_predict8x8_ssse3) sym(vp8_bilinear_predict8x8_ssse3): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c index b9d087e20d..7fb83c2d5e 100644 --- a/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ b/media/libvpx/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "vpx_ports/mem.h" -#include "filter_x86.h" extern const short vp8_six_tap_x86[8][6 * 8]; @@ -95,9 +94,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]); /* Temp data bufffer used in filtering */ @@ -236,9 +233,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); if (xoffset) { @@ -351,8 +346,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, yoffset); } else { /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ int r; for (r = 0; r < 4; ++r) { diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.c b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c index 9cf74bf856..11099c453c 100644 --- a/media/libvpx/libvpx/vp8/decoder/dboolhuff.c +++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.c @@ -15,7 +15,13 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, unsigned int source_sz, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { - br->user_buffer_end = source + source_sz; + if (source_sz && !source) return 1; + + // To simplify calling code this fuction can be called with |source| == null + // and |source_sz| == 0. This and vp8dx_bool_decoder_fill() are essentially + // no-ops in this case. + // Work around a ubsan warning with a ternary to avoid adding 0 to null. + br->user_buffer_end = source ? source + source_sz : source; br->user_buffer = source; br->value = 0; br->count = -8; @@ -23,8 +29,6 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, br->decrypt_cb = decrypt_cb; br->decrypt_state = decrypt_state; - if (source_sz && !source) return 1; - /* Populate the buffer */ vp8dx_bool_decoder_fill(br); diff --git a/media/libvpx/libvpx/vp8/decoder/dboolhuff.h b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h index 04c027cd78..673b2fbd5d 100644 --- a/media/libvpx/libvpx/vp8/decoder/dboolhuff.h +++ b/media/libvpx/libvpx/vp8/decoder/dboolhuff.h @@ -8,13 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DBOOLHUFF_H_ -#define VP8_DECODER_DBOOLHUFF_H_ +#ifndef VPX_VP8_DECODER_DBOOLHUFF_H_ +#define VPX_VP8_DECODER_DBOOLHUFF_H_ #include #include #include "./vpx_config.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" @@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, void vp8dx_bool_decoder_fill(BOOL_DECODER *br); -static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { +static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br, + int probability) { unsigned int bit = 0; VP8_BD_VALUE value; unsigned int split; @@ -76,7 +78,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register int shift = vp8_norm[range]; + const unsigned char shift = vp8_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -127,4 +129,4 @@ static INLINE int vp8dx_bool_error(BOOL_DECODER *br) { } // extern "C" #endif -#endif // VP8_DECODER_DBOOLHUFF_H_ +#endif // VPX_VP8_DECODER_DBOOLHUFF_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/decodeframe.c b/media/libvpx/libvpx/vp8/decoder/decodeframe.c index 0aec2a01b0..a20f33dd8c 100644 --- a/media/libvpx/libvpx/vp8/decoder/decodeframe.c +++ b/media/libvpx/libvpx/vp8/decoder/decodeframe.c @@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { /* Decide whether to use the default or alternate baseline Q value. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; /* Delta Value */ @@ -211,7 +211,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, vp8_short_inv_walsh4x4(&b->dqcoeff[0], xd->qcoeff); memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0])); } else { - b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0]; + b->dqcoeff[0] = (short)(b->qcoeff[0] * xd->dequant_y2[0]); vp8_short_inv_walsh4x4_1(&b->dqcoeff[0], xd->qcoeff); memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0])); } @@ -610,8 +610,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) { lf_dst[2]); } else { vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride, - recon_uv_stride, lf_dst[0], lf_dst[1], - lf_dst[2]); + lf_dst[0]); } if (mb_row > 1) { yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], @@ -647,8 +646,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) { lf_dst[2]); } else { vp8_loop_filter_row_simple(pc, lf_mic, mb_row - 1, recon_y_stride, - recon_uv_stride, lf_dst[0], lf_dst[1], - lf_dst[2]); + lf_dst[0]); } yv12_extend_frame_left_right_c(yv12_fb_new, eb_dst[0], eb_dst[1], @@ -674,7 +672,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi, static int read_is_valid(const unsigned char *start, size_t len, const unsigned char *end) { - return (start + len > start && start + len <= end); + return len != 0 && end > start && len <= (size_t)(end - start); } static unsigned int read_available_partition_size( @@ -686,6 +684,12 @@ static unsigned int read_available_partition_size( const unsigned char *partition_size_ptr = token_part_sizes + i * 3; unsigned int partition_size = 0; ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } /* Calculate the length of this partition. The last partition * size is implicit. If the partition size can't be read, then * either use the remaining data in the buffer (for EC mode) @@ -750,6 +754,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, ptrdiff_t ext_first_part_size = token_part_sizes - pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1); + if (fragment_size < (unsigned int)ext_first_part_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)ext_first_part_size; if (fragment_size > 0) { pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size; @@ -767,6 +774,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, first_fragment_end, fragment_end, fragment_idx - 1, num_token_partitions); pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size; + if (fragment_size < (unsigned int)partition_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)partition_size; assert(fragment_idx <= num_token_partitions); if (fragment_size > 0) { @@ -819,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) { /* reset the segment feature data to 0 with delta coding (Default state). */ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - xd->mb_segement_abs_delta = SEGMENT_DELTADATA; + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; /* reset the mode ref deltasa for loop filter */ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); @@ -862,8 +872,8 @@ static void init_frame(VP8D_COMP *pbi) { xd->mode_info_stride = pc->mode_info_stride; xd->corrupted = 0; /* init without corruption */ - xd->fullpixel_mask = 0xffffffff; - if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (pc->full_pixel) xd->fullpixel_mask = ~7; } int vp8_decode_frame(VP8D_COMP *pbi) { @@ -930,7 +940,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* When error concealment is enabled we should only check the sync * code if we have enough bits available */ - if (!pbi->ec_active || data + 3 < data_end) { + if (data + 3 < data_end) { if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); @@ -941,16 +951,22 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * if we have enough data. Otherwise we will end up with the wrong * size. */ - if (!pbi->ec_active || data + 6 < data_end) { + if (data + 6 < data_end) { pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; pc->horiz_scale = clear[4] >> 6; pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. */ + data = data_end; } - data += 7; } else { - memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); - memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); + xd->pre = *yv12_fb_new; + xd->dst = *yv12_fb_new; } } if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME)) { @@ -979,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); if (xd->update_mb_segmentation_data) { - xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); @@ -1140,7 +1156,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_entropy_probs = 0; #endif if (pc->refresh_entropy_probs == 0) { - memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc)); + pc->lfc = pc->fc; } pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc); @@ -1151,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (0) { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, - pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, - pc->refresh_last_frame, pc->base_qindex); - fclose(z); - } - { pbi->independent_partitions = 1; @@ -1199,9 +1207,14 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pbi->frame_corrupt_residual = 0; #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; - vp8mt_decode_mb_rows(pbi, xd); + if (vp8mt_decode_mb_rows(pbi, xd)) { + vp8_decoder_remove_threads(pbi); + pbi->restart_threads = 1; + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, NULL); + } vp8_yv12_extend_frame_borders(yv12_fb_new); for (thread = 0; thread < pbi->decoding_thread_count; ++thread) { corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted; @@ -1232,7 +1245,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * \n",bc->pos+pbi->bc2.pos); */ if (pc->refresh_entropy_probs == 0) { - memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc)); + pc->fc = pc->lfc; pbi->independent_partitions = prev_independent_partitions; } diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.c b/media/libvpx/libvpx/vp8/decoder/decodemv.c index b946ab73d0..3f459d623f 100644 --- a/media/libvpx/libvpx/vp8/decoder/decodemv.c +++ b/media/libvpx/libvpx/vp8/decoder/decodemv.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "decodemv.h" #include "treereader.h" #include "vp8/common/entropymv.h" #include "vp8/common/entropymode.h" @@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { const vp8_prob *const p = (const vp8_prob *)mvc; int x = 0; - if (vp8_read(r, p[mvpis_short])) /* Large */ - { + if (vp8_read(r, p[mvpis_short])) { /* Large */ int i = 0; do { @@ -173,7 +173,8 @@ const vp8_prob vp8_sub_mv_ref_prob3[8][VP8_SUBMVREFS - 1] = { { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */ }; -static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) { +static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left, + const uint32_t above) { int lez = (left == 0); int aez = (above == 0); int lea = (left == above); @@ -284,8 +285,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) { vp8_reader *const bc = &pbi->mbc[8]; mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); - if (mbmi->ref_frame) /* inter MB */ - { + if (mbmi->ref_frame) { /* inter MB */ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; int cnt[4]; int *cntx = cnt; @@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, tmp = cnt[CNT_NEAREST]; cnt[CNT_NEAREST] = cnt[CNT_NEAR]; cnt[CNT_NEAR] = tmp; - tmp = near_mvs[CNT_NEAREST].as_int; + tmp = (int)near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; - near_mvs[CNT_NEAR].as_int = tmp; + near_mvs[CNT_NEAR].as_int = (uint32_t)tmp; } if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) { @@ -486,10 +486,7 @@ static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x) { } } -static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi, - MB_MODE_INFO *mbmi) { - (void)mbmi; - +static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi) { /* Read the Macroblock segmentation map if it is being updated explicitly * this frame (reset to 0 above by default) * By default on a key frame reset all MBs to segment 0 @@ -538,7 +535,7 @@ void vp8_decode_mode_mvs(VP8D_COMP *pbi) { int mb_num = mb_row * pbi->common.mb_cols + mb_col; #endif - decode_mb_mode_mvs(pbi, mi, &mi->mbmi); + decode_mb_mode_mvs(pbi, mi); #if CONFIG_ERROR_CONCEALMENT /* look for corruption. set mvs_corrupt_from_mb to the current diff --git a/media/libvpx/libvpx/vp8/decoder/decodemv.h b/media/libvpx/libvpx/vp8/decoder/decodemv.h index f33b07351d..504e943d85 100644 --- a/media/libvpx/libvpx/vp8/decoder/decodemv.h +++ b/media/libvpx/libvpx/vp8/decoder/decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DECODEMV_H_ -#define VP8_DECODER_DECODEMV_H_ +#ifndef VPX_VP8_DECODER_DECODEMV_H_ +#define VPX_VP8_DECODER_DECODEMV_H_ #include "onyxd_int.h" @@ -23,4 +23,4 @@ void vp8_decode_mode_mvs(VP8D_COMP *); } // extern "C" #endif -#endif // VP8_DECODER_DECODEMV_H_ +#endif // VPX_VP8_DECODER_DECODEMV_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/decoderthreading.h b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h index c563cf6e93..3d49bc8317 100644 --- a/media/libvpx/libvpx/vp8/decoder/decoderthreading.h +++ b/media/libvpx/libvpx/vp8/decoder/decoderthreading.h @@ -8,15 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DECODERTHREADING_H_ -#define VP8_DECODER_DECODERTHREADING_H_ +#ifndef VPX_VP8_DECODER_DECODERTHREADING_H_ +#define VPX_VP8_DECODER_DECODERTHREADING_H_ #ifdef __cplusplus extern "C" { #endif #if CONFIG_MULTITHREAD -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); void vp8_decoder_remove_threads(VP8D_COMP *pbi); void vp8_decoder_create_threads(VP8D_COMP *pbi); void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); @@ -27,4 +27,4 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); } // extern "C" #endif -#endif // VP8_DECODER_DECODERTHREADING_H_ +#endif // VPX_VP8_DECODER_DECODERTHREADING_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.c b/media/libvpx/libvpx/vp8/decoder/detokenize.c index b350bafbc5..1c77873f0b 100644 --- a/media/libvpx/libvpx/vp8/decoder/detokenize.c +++ b/media/libvpx/libvpx/vp8/decoder/detokenize.c @@ -11,6 +11,7 @@ #include "vp8/common/blockd.h" #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "detokenize.h" @@ -52,7 +53,10 @@ static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, /* for const-casting */ typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS]; -static int GetSigned(BOOL_DECODER *br, int value_to_sign) { +// With corrupt / fuzzed streams the calculation of br->value may overflow. See +// b/148271109. +static VPX_NO_UNSIGNED_OVERFLOW_CHECK int GetSigned(BOOL_DECODER *br, + int value_to_sign) { int split = (br->range + 1) >> 1; VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); int v; diff --git a/media/libvpx/libvpx/vp8/decoder/detokenize.h b/media/libvpx/libvpx/vp8/decoder/detokenize.h index f0b125444f..410a431ba0 100644 --- a/media/libvpx/libvpx/vp8/decoder/detokenize.h +++ b/media/libvpx/libvpx/vp8/decoder/detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DETOKENIZE_H_ -#define VP8_DECODER_DETOKENIZE_H_ +#ifndef VPX_VP8_DECODER_DETOKENIZE_H_ +#define VPX_VP8_DECODER_DETOKENIZE_H_ #include "onyxd_int.h" @@ -24,4 +24,4 @@ int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); } // extern "C" #endif -#endif // VP8_DECODER_DETOKENIZE_H_ +#endif // VPX_VP8_DECODER_DETOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/ec_types.h b/media/libvpx/libvpx/vp8/decoder/ec_types.h index 0ab08b649a..84feb269df 100644 --- a/media/libvpx/libvpx/vp8/decoder/ec_types.h +++ b/media/libvpx/libvpx/vp8/decoder/ec_types.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_EC_TYPES_H_ -#define VP8_DECODER_EC_TYPES_H_ +#ifndef VPX_VP8_DECODER_EC_TYPES_H_ +#define VPX_VP8_DECODER_EC_TYPES_H_ #ifdef __cplusplus extern "C" { @@ -34,7 +34,9 @@ typedef struct { /* Structure used to hold all the overlaps of a macroblock. The overlaps of a * macroblock is further divided into block overlaps. */ -typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP; +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; /* Structure for keeping track of motion vectors and which reference frame they * refer to. Used for motion vector interpolation. @@ -48,4 +50,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_DECODER_EC_TYPES_H_ +#endif // VPX_VP8_DECODER_EC_TYPES_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.c b/media/libvpx/libvpx/vp8/decoder/error_concealment.c index e22141492c..85982e4de3 100644 --- a/media/libvpx/libvpx/vp8/decoder/error_concealment.c +++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.c @@ -147,8 +147,8 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi, } } -void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, - union b_mode_info *bmi, int b_row, int b_col) { +static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, + union b_mode_info *bmi, int b_row, int b_col) { MB_OVERLAP *mb_overlap; int row, col, rel_row, rel_col; int new_row, new_col; @@ -280,9 +280,9 @@ static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi, int sub_col; for (sub_row = 0; sub_row < 4; ++sub_row) { for (sub_col = 0; sub_col < 4; ++sub_col) { - vp8_calculate_overlaps(overlaps, mb_rows, mb_cols, - &(prev_mi->bmi[sub_row * 4 + sub_col]), - 4 * mb_row + sub_row, 4 * mb_col + sub_col); + calculate_overlaps(overlaps, mb_rows, mb_cols, + &(prev_mi->bmi[sub_row * 4 + sub_col]), + 4 * mb_row + sub_row, 4 * mb_col + sub_col); } } } diff --git a/media/libvpx/libvpx/vp8/decoder/error_concealment.h b/media/libvpx/libvpx/vp8/decoder/error_concealment.h index 89c78c1442..608a79f189 100644 --- a/media/libvpx/libvpx/vp8/decoder/error_concealment.h +++ b/media/libvpx/libvpx/vp8/decoder/error_concealment.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_ -#define VP8_DECODER_ERROR_CONCEALMENT_H_ +#ifndef VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" @@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col, } // extern "C" #endif -#endif // VP8_DECODER_ERROR_CONCEALMENT_H_ +#endif // VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c index a1050c478d..88f2de024b 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_if.c +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_if.c @@ -16,6 +16,7 @@ #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/alloccommon.h" +#include "vp8/common/common.h" #include "vp8/common/loopfilter.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" @@ -36,12 +37,11 @@ #if CONFIG_ERROR_CONCEALMENT #include "error_concealment.h" #endif -#if ARCH_ARM +#if VPX_ARCH_ARM #include "vpx_ports/arm.h" #endif extern void vp8_init_loop_filter(VP8_COMMON *cm); -extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); static int get_free_fb(VP8_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); @@ -302,12 +302,9 @@ static int check_fragments_for_errors(VP8D_COMP *pbi) { return 1; } -int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, - const uint8_t *source, int64_t time_stamp) { +int vp8dx_receive_compressed_data(VP8D_COMP *pbi) { VP8_COMMON *cm = &pbi->common; int retcode = -1; - (void)size; - (void)source; pbi->common.error.error_code = VPX_CODEC_OK; @@ -322,21 +319,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx]; pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx]; - if (setjmp(pbi->common.error.jmp)) { - /* We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - - if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) { - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - } - goto decode_exit; - } - - pbi->common.error.setjmp = 1; - retcode = vp8_decode_frame(pbi); if (retcode < 0) { @@ -345,6 +327,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, } pbi->common.error.error_code = VPX_CODEC_ERROR; + // Propagate the error info. + if (pbi->mb.error_info.error_code != 0) { + pbi->common.error.error_code = pbi->mb.error_info.error_code; + memcpy(pbi->common.error.detail, pbi->mb.error_info.detail, + sizeof(pbi->mb.error_info.detail)); + } goto decode_exit; } @@ -380,15 +368,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, #endif pbi->ready_for_new_data = 0; - pbi->last_time_stamp = time_stamp; decode_exit: - pbi->common.error.setjmp = 0; vpx_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; @@ -398,8 +383,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, if (pbi->common.show_frame == 0) return ret; pbi->ready_for_new_data = 1; - *time_stamp = pbi->last_time_stamp; - *time_end_stamp = 0; #if CONFIG_POSTPROC ret = vp8_post_proc_frame(&pbi->common, sd, flags); @@ -445,8 +428,9 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (setjmp(fb->pbi[0]->common.error.jmp)) { + fb->pbi[0]->common.error.setjmp = 0; vp8_remove_decoder_instances(fb); - memset(fb->pbi, 0, sizeof(fb->pbi) / sizeof(fb->pbi[0])); + vp8_zero(fb->pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } @@ -469,5 +453,10 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) { /* decoder instance for single thread mode */ remove_decompressor(pbi); + fb->pbi[0] = NULL; return VPX_CODEC_OK; } + +int vp8dx_get_quantizer(const VP8D_COMP *pbi) { + return pbi->common.base_qindex; +} diff --git a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h index 88b1ff16bc..08a60b31b9 100644 --- a/media/libvpx/libvpx/vp8/decoder/onyxd_int.h +++ b/media/libvpx/libvpx/vp8/decoder/onyxd_int.h @@ -8,10 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ONYXD_INT_H_ -#define VP8_DECODER_ONYXD_INT_H_ +#ifndef VPX_VP8_DECODER_ONYXD_INT_H_ +#define VPX_VP8_DECODER_ONYXD_INT_H_ + +#include #include "vpx_config.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/onyxd.h" #include "treereader.h" #include "vp8/common/onyxc_int.h" @@ -31,7 +34,9 @@ typedef struct { void *ptr2; } DECODETHREAD_DATA; -typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC; +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; typedef struct { int enabled; @@ -68,7 +73,7 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - int b_multithreaded_rd; + vpx_atomic_int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -76,9 +81,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; - int *mt_current_mb_col; /* Each row remembers its already decoded column. */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ + /* Each row remembers its already decoded column. */ + vpx_atomic_int *mt_current_mb_col; unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; @@ -91,12 +95,11 @@ typedef struct VP8D_COMP { DECODETHREAD_DATA *de_thread_data; pthread_t *h_decoding_thread; - sem_t *h_event_start_decoding; - sem_t h_event_end_decoding; + vp8_sem_t *h_event_start_decoding; + vp8_sem_t h_event_end_decoding; /* end of threading data */ #endif - int64_t last_time_stamp; int ready_for_new_data; vp8_prob prob_intra; @@ -117,34 +120,23 @@ typedef struct VP8D_COMP { vpx_decrypt_cb decrypt_cb; void *decrypt_state; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif } VP8D_COMP; -int vp8_decode_frame(VP8D_COMP *cpi); +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); +int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_DECODER_ONYXD_INT_H_ +#endif // VPX_VP8_DECODER_ONYXD_INT_H_ diff --git a/media/libvpx/libvpx/vp8/decoder/threading.c b/media/libvpx/libvpx/vp8/decoder/threading.c index 9f77519882..d16284d134 100644 --- a/media/libvpx/libvpx/vp8/decoder/threading.c +++ b/media/libvpx/libvpx/vp8/decoder/threading.c @@ -10,16 +10,18 @@ #include "vpx_config.h" #include "vp8_rtcd.h" -#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1 +#if !defined(_WIN32) && CONFIG_OS_SUPPORT == 1 #include #endif #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" +#include "vp8/common/common.h" #include "vp8/common/threading.h" - #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" #include "detokenize.h" #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" @@ -29,15 +31,15 @@ #include "error_concealment.h" #endif -#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n))) -#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ - do { \ - CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \ - memset((p), 0, (n) * sizeof(*(p))); \ +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ } while (0) -void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); - static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { VP8_COMMON *const pc = &pbi->common; @@ -55,7 +57,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->dst = xd->dst; mbd->segmentation_enabled = xd->segmentation_enabled; - mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -75,12 +77,13 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2)); memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv)); - mbd->fullpixel_mask = 0xffffffff; + mbd->fullpixel_mask = ~0; - if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + if (pc->full_pixel) mbd->fullpixel_mask = ~7; } - for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1; + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); } static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -248,12 +251,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - const int *last_row_current_mb_col; - int *current_mb_col; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; - const int first_row_no_sync_above = pc->mb_cols + nsync; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); int num_part = 1 << pbi->common.multi_token_partition; int last_mb_row = start_mb_row; @@ -357,13 +361,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. @@ -401,16 +403,32 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; - xd->pre.y_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset; - xd->pre.u_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset; - xd->pre.v_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset; - /* propagate errors from reference frames */ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + if (xd->corrupted) { + // Move current decoding marcoblock to the end of row for all rows + // assigned to this thread, such that other threads won't be waiting. + for (; mb_row < pc->mb_rows; + mb_row += (pbi->decoding_thread_count + 1)) { + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync); + } + vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Corrupted reference frame"); + } + + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used. + xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } mt_decode_macroblock(pbi, xd, 0); xd->left_available = 1; @@ -549,7 +567,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -558,41 +576,48 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count; } - /* signal end of frame decoding if this thread processed the last mb_row */ - if (last_mb_row == (pc->mb_rows - 1)) sem_post(&pbi->h_event_end_decoding); + /* signal end of decoding of current thread for current frame */ + if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) + vp8_sem_post(&pbi->h_event_end_decoding); } -static THREAD_FUNCTION thread_decoding_proc(void *p_data) { +static THREADFN thread_decoding_proc(void *p_data) { int ithread = ((DECODETHREAD_DATA *)p_data)->ithread; VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1); MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2); ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break; + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; - if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) { + if (vp8_sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; xd->left_context = &mb_row_left_context; - + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + // Signal the end of decoding for current thread. + vp8_sem_post(&pbi->h_event_end_decoding); + continue; + } + xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, ithread + 1); + xd->error_info.setjmp = 0; } } } - return 0; + return THREAD_EXIT_SUCCESS; } void vp8_decoder_create_threads(VP8D_COMP *pbi) { int core_count = 0; unsigned int ithread; - pbi->b_multithreaded_rd = 0; + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); pbi->allocated_decoding_thread_count = 0; - pthread_mutex_init(&pbi->mt_mutex, NULL); /* limit decoding threads to the max number of token partitions */ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; @@ -603,7 +628,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { } if (core_count > 1) { - pbi->b_multithreaded_rd = 1; + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); pbi->decoding_thread_count = core_count - 1; CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); @@ -611,13 +636,13 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32); CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count); - if (sem_init(&pbi->h_event_end_decoding, 0, 0)) { + if (vp8_sem_init(&pbi->h_event_end_decoding, 0, 0)) { vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to initialize semaphore"); } for (ithread = 0; ithread < pbi->decoding_thread_count; ++ithread) { - if (sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; + if (vp8_sem_init(&pbi->h_event_start_decoding[ithread], 0, 0)) break; vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd); @@ -627,7 +652,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { if (pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, &pbi->de_thread_data[ithread])) { - sem_destroy(&pbi->h_event_start_decoding[ithread]); + vp8_sem_destroy(&pbi->h_event_start_decoding[ithread]); break; } } @@ -638,7 +663,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { /* the remainder of cleanup cases will be handled in * vp8_decoder_remove_threads(). */ if (pbi->allocated_decoding_thread_count == 0) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, "Failed to create threads"); @@ -649,16 +674,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { int i; - /* De-allocate mutex */ - if (pbi->pmutex != NULL) { - for (i = 0; i < mb_rows; ++i) { - pthread_mutex_destroy(&pbi->pmutex[i]); - } - - vpx_free(pbi->pmutex); - pbi->pmutex = NULL; - } - vpx_free(pbi->mt_current_mb_col); pbi->mt_current_mb_col = NULL; @@ -724,7 +739,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -742,73 +757,73 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; - /* Allocate mutex */ - CHECK_MEM_ERROR(pbi->pmutex, - vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows)); - if (pbi->pmutex) { - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_init(&pbi->pmutex[i], NULL); - } - } - - /* Allocate an int for each mb row. */ - CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); + vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); + } CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); + } CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); + } /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); } } void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { int i; - protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); pthread_join(pbi->h_decoding_thread[i], NULL); } for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { - sem_destroy(&pbi->h_event_start_decoding[i]); + vp8_sem_destroy(&pbi->h_event_start_decoding[i]); } if (pbi->allocated_decoding_thread_count) { - sem_destroy(&pbi->h_event_end_decoding); + vp8_sem_destroy(&pbi->h_event_end_decoding); } vpx_free(pbi->h_decoding_thread); @@ -825,10 +840,9 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); } - pthread_mutex_destroy(&pbi->mt_mutex); } -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { VP8_COMMON *pc = &pbi->common; unsigned int i; int j; @@ -871,10 +885,26 @@ void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { pbi->decoding_thread_count); for (i = 0; i < pbi->decoding_thread_count; ++i) { - sem_post(&pbi->h_event_start_decoding[i]); + vp8_sem_post(&pbi->h_event_start_decoding[i]); } - mt_decode_mb_rows(pbi, xd, 0); + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + xd->corrupted = 1; + // Wait for other threads to finish. This prevents other threads decoding + // the current frame while the main thread starts decoding the next frame, + // which causes a data race. + for (i = 0; i < pbi->decoding_thread_count; ++i) + vp8_sem_wait(&pbi->h_event_end_decoding); + return -1; + } - sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + xd->error_info.setjmp = 1; + mt_decode_mb_rows(pbi, xd, 0); + xd->error_info.setjmp = 0; + + for (i = 0; i < pbi->decoding_thread_count + 1; ++i) + vp8_sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + + return 0; } diff --git a/media/libvpx/libvpx/vp8/decoder/treereader.h b/media/libvpx/libvpx/vp8/decoder/treereader.h index dd0f0986e9..4bf938a741 100644 --- a/media/libvpx/libvpx/vp8/decoder/treereader.h +++ b/media/libvpx/libvpx/vp8/decoder/treereader.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_TREEREADER_H_ -#define VP8_DECODER_TREEREADER_H_ +#ifndef VPX_VP8_DECODER_TREEREADER_H_ +#define VPX_VP8_DECODER_TREEREADER_H_ #include "./vpx_config.h" #include "vp8/common/treecoder.h" @@ -30,7 +30,7 @@ typedef BOOL_DECODER vp8_reader; static INLINE int vp8_treed_read( vp8_reader *const r, /* !!! must return a 0 or 1 !!! */ vp8_tree t, const vp8_prob *const p) { - register vp8_tree_index i = 0; + vp8_tree_index i = 0; while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0) { } @@ -42,4 +42,4 @@ static INLINE int vp8_treed_read( } // extern "C" #endif -#endif // VP8_DECODER_TREEREADER_H_ +#endif // VPX_VP8_DECODER_TREEREADER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c index c42005df6c..950c943343 100644 --- a/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -9,6 +9,8 @@ */ #include + +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" static const uint16_t inv_zig_zag[16] = { 1, 2, 6, 7, 3, 5, 8, 13, @@ -26,9 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; +#if !VPX_ARCH_AARCH64 uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; +#endif // !VPX_ARCH_AARCH64 /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -66,11 +70,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); +#if VPX_ARCH_AARCH64 + *d->eob = (int8_t)vmaxvq_u16(eob0); +#else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); eob_q32 = vmovl_u16(eob_d16); eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); eob_d32 = vpmax_u32(eob_d32, eob_d32); + vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); +#endif // VPX_ARCH_AARCH64 + /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); vst1q_s16(d->qcoeff + 8, x1); @@ -78,6 +88,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* dqcoeff = x * dequant */ vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); - - vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); } diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c index 76853e6524..99dff6b520 100644 --- a/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, dEmptys16; diff --git a/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c index 8d6ea4ccbe..02056f2f90 100644 --- a/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c +++ b/media/libvpx/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -9,6 +9,8 @@ */ #include + +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.c b/media/libvpx/libvpx/vp8/encoder/bitstream.c index 7086faae98..7bcdf77708 100644 --- a/media/libvpx/libvpx/vp8/encoder/bitstream.c +++ b/media/libvpx/libvpx/vp8/encoder/bitstream.c @@ -19,6 +19,7 @@ #include #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/system_state.h" #include "bitstream.h" @@ -41,13 +42,6 @@ const int vp8cx_base_skip_false_prob[128] = { unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -int intra_mode_stats[10][10][10]; -static unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]; -extern unsigned int active_section; -#endif - #ifdef MODE_STATS int count_mb_seg[4] = { 0, 0, 0, 0 }; #endif @@ -124,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) { vp8_mbsplit_encodings + x); } -void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { +void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w, + const TOKENEXTRA *p, + int xcount) { const TOKENEXTRA *stop = p + xcount; unsigned int split; int shift; @@ -178,10 +174,9 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); - w->buffer[w->pos++] = (lowvalue >> (24 - offset)); - lowvalue <<= offset; + w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } @@ -229,10 +224,9 @@ void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) { validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error); - w->buffer[w->pos++] = (lowvalue >> (24 - offset)); - lowvalue <<= offset; + w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } @@ -428,10 +422,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { vp8_convert_rfct_to_prob(cpi); -#ifdef VP8_ENTROPY_STATS - active_section = 1; -#endif - if (pc->mb_no_coeff_skip) { int total_mbs = pc->mb_rows * pc->mb_cols; @@ -472,10 +462,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; -#ifdef VP8_ENTROPY_STATS - active_section = 9; -#endif - if (cpi->mb.e_mbd.update_mb_segmentation_map) { write_mb_features(w, mi, &cpi->mb.e_mbd); } @@ -486,9 +472,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (rf == INTRA_FRAME) { vp8_write(w, 0, cpi->prob_intra_coded); -#ifdef VP8_ENTROPY_STATS - active_section = 6; -#endif write_ymode(w, mode, pc->fc.ymode_prob); if (mode == B_PRED) { @@ -500,8 +483,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); - } else /* inter coded */ - { + } else { /* inter coded */ int_mv best_mv; vp8_prob mv_ref_p[VP8_MVREFS - 1]; @@ -519,32 +501,17 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { int ct[4]; vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, - cpi->common.ref_frame_sign_bias); + pc->ref_frame_sign_bias); vp8_clamp_mv2(&best_mv, xd); vp8_mv_ref_probs(mv_ref_p, ct); - -#ifdef VP8_ENTROPY_STATS - accum_mv_refs(mode, ct); -#endif } -#ifdef VP8_ENTROPY_STATS - active_section = 3; -#endif - write_mv_ref(w, mode, mv_ref_p); switch (mode) /* new, split require MVs */ { - case NEWMV: - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif - - write_mv(w, &mi->mv.as_mv, &best_mv, mvc); - break; + case NEWMV: write_mv(w, &mi->mv.as_mv, &best_mv, mvc); break; case SPLITMV: { int j = 0; @@ -575,9 +542,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2[mv_contz]); if (blockmode == NEW4X4) { -#ifdef VP8_ENTROPY_STATS - active_section = 11; -#endif write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *)mvc); } } while (++j < cpi->mb.partition_info->count); @@ -643,10 +607,6 @@ static void write_kfmodes(VP8_COMP *cpi) { const B_PREDICTION_MODE L = left_block_mode(m, i); const int bm = m->bmi[i].as_mode; -#ifdef VP8_ENTROPY_STATS - ++intra_mode_stats[A][L][bm]; -#endif - write_bmode(bc, bm, vp8_kf_bmode_prob[A][L]); } while (++i < 16); } @@ -907,7 +867,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *const w = cpi->bc; #endif - int savings = 0; vpx_clear_system_state(); @@ -974,10 +933,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { vp8_write(w, u, upd); #endif -#ifdef VP8_ENTROPY_STATS - ++tree_update_hist[i][j][k][t][u]; -#endif - if (u) { /* send/use new probability */ @@ -985,22 +940,10 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_write_literal(w, newp, 8); #endif - - savings += s; } } while (++t < ENTROPY_NODES); -/* Accum token counts for generation of default statistics */ -#ifdef VP8_ENTROPY_STATS - t = 0; - - do { - context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t]; - } while (++t < MAX_ENTROPY_TOKENS); - -#endif - } while (++k < PREV_COEF_CONTEXTS); } while (++j < COEF_BANDS); } while (++i < BLOCK_TYPES); @@ -1078,7 +1021,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, bc[0].error = &pc->error; - validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error); + validate_buffer(cx_data, 3, cx_data_end, &pc->error); cx_data += 3; #if defined(SECTIONBITS_OUTPUT) @@ -1091,19 +1034,25 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (oh.type == KEY_FRAME) { int v; - validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error); + validate_buffer(cx_data, 7, cx_data_end, &pc->error); /* Start / synch code */ cx_data[0] = 0x9D; cx_data[1] = 0x01; cx_data[2] = 0x2a; + /* Pack scale and frame size into 16 bits. Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * 16 bits : (2 bits Horizontal Scale << 14) | Width (14 bits) + * 16 bits : (2 bits Vertical Scale << 14) | Height (14 bits) + */ v = (pc->horiz_scale << 14) | pc->Width; - cx_data[3] = v; + cx_data[3] = v & 0xff; cx_data[4] = v >> 8; v = (pc->vert_scale << 14) | pc->Height; - cx_data[5] = v; + cx_data[5] = v & 0xff; cx_data[6] = v >> 8; extra_bytes_packed = 7; @@ -1131,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, xd->mb_segement_abs_delta); + vp8_write_bit(bc, xd->mb_segment_abs_delta); /* For each segmentation feature (Quant and loop filter level) */ for (i = 0; i < MB_LVL_MAX; ++i) { @@ -1287,15 +1236,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame); -#ifdef VP8_ENTROPY_STATS - - if (pc->frame_type == INTER_FRAME) - active_section = 0; - else - active_section = 7; - -#endif - vpx_clear_system_state(); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -1303,31 +1243,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, #else if (pc->refresh_entropy_probs == 0) { /* save a copy for later refresh */ - memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc)); + pc->lfc = pc->fc; } vp8_update_coef_probs(cpi); #endif -#ifdef VP8_ENTROPY_STATS - active_section = 2; -#endif - /* Write out the mb_no_coeff_skip flag */ vp8_write_bit(bc, pc->mb_no_coeff_skip); if (pc->frame_type == KEY_FRAME) { write_kfmodes(cpi); - -#ifdef VP8_ENTROPY_STATS - active_section = 8; -#endif } else { pack_inter_mode_mvs(cpi); - -#ifdef VP8_ENTROPY_STATS - active_section = 1; -#endif } vp8_stop_encode(bc); @@ -1338,11 +1266,30 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, /* update frame tag */ { + /* Pack partition size, show frame, version and frame type into to 24 bits. + * Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * The uncompressed data chunk comprises a common (for key frames and + * interframes) 3-byte frame tag that contains four fields, as follows: + * + * 1. A 1-bit frame type (0 for key frames, 1 for interframes). + * + * 2. A 3-bit version number (0 - 3 are defined as four different + * profiles with different decoding complexity; other values may be + * defined for future variants of the VP8 data format). + * + * 3. A 1-bit show_frame flag (0 when current frame is not for display, + * 1 when current frame is for display). + * + * 4. A 19-bit field containing the size of the first data partition in + * bytes + */ int v = (oh.first_partition_length_in_bytes << 5) | (oh.show_frame << 4) | (oh.version << 1) | oh.type; - dest[0] = v; - dest[1] = v >> 8; + dest[0] = v & 0xff; + dest[1] = (v >> 8) & 0xff; dest[2] = v >> 16; } @@ -1416,7 +1363,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { pack_mb_row_tokens(cpi, &cpi->bc[1]); } else { vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); @@ -1432,50 +1379,3 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, } #endif } - -#ifdef VP8_ENTROPY_STATS -void print_tree_update_probs() { - int i, j, k, l; - FILE *f = fopen("context.c", "a"); - int Sum; - fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n"); - fprintf(f, - "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n"); - - for (i = 0; i < BLOCK_TYPES; ++i) { - fprintf(f, " { \n"); - - for (j = 0; j < COEF_BANDS; ++j) { - fprintf(f, " {\n"); - - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - fprintf(f, " {"); - - for (l = 0; l < ENTROPY_NODES; ++l) { - Sum = - tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1]; - - if (Sum > 0) { - if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", - (tree_update_hist[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); - } - - fprintf(f, "},\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} -#endif diff --git a/media/libvpx/libvpx/vp8/encoder/bitstream.h b/media/libvpx/libvpx/vp8/encoder/bitstream.h index 2b196dcd27..ee3f3e4aab 100644 --- a/media/libvpx/libvpx/vp8/encoder/bitstream.h +++ b/media/libvpx/libvpx/vp8/encoder/bitstream.h @@ -8,17 +8,25 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BITSTREAM_H_ -#define VP8_ENCODER_BITSTREAM_H_ +#ifndef VPX_VP8_ENCODER_BITSTREAM_H_ +#define VPX_VP8_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { #endif +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_BITSTREAM_H_ +#endif // VPX_VP8_ENCODER_BITSTREAM_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/block.h b/media/libvpx/libvpx/vp8/encoder/block.h index f9a273bd27..f0efd3e1e2 100644 --- a/media/libvpx/libvpx/vp8/encoder/block.h +++ b/media/libvpx/libvpx/vp8/encoder/block.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BLOCK_H_ -#define VP8_ENCODER_BLOCK_H_ +#ifndef VPX_VP8_ENCODER_BLOCK_H_ +#define VPX_VP8_ENCODER_BLOCK_H_ #include "vp8/common/onyx.h" #include "vp8/common/blockd.h" @@ -166,4 +166,4 @@ typedef struct macroblock { } // extern "C" #endif -#endif // VP8_ENCODER_BLOCK_H_ +#endif // VPX_VP8_ENCODER_BLOCK_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.c b/media/libvpx/libvpx/vp8/encoder/boolhuff.c index 04f8db9331..819c2f22a0 100644 --- a/media/libvpx/libvpx/vp8/encoder/boolhuff.c +++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.c @@ -15,10 +15,6 @@ unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -unsigned int active_section = 0; -#endif - const unsigned int vp8_prob_cost[256] = { 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, @@ -42,26 +38,26 @@ const unsigned int vp8_prob_cost[256] = { 12, 10, 9, 7, 6, 4, 3, 1, 1 }; -void vp8_start_encode(BOOL_CODER *br, unsigned char *source, +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, unsigned char *source_end) { - br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->buffer_end = source_end; - br->pos = 0; + bc->lowvalue = 0; + bc->range = 255; + bc->count = -24; + bc->buffer = source; + bc->buffer_end = source_end; + bc->pos = 0; } -void vp8_stop_encode(BOOL_CODER *br) { +void vp8_stop_encode(BOOL_CODER *bc) { int i; - for (i = 0; i < 32; ++i) vp8_encode_bool(br, 0, 128); + for (i = 0; i < 32; ++i) vp8_encode_bool(bc, 0, 128); } -void vp8_encode_value(BOOL_CODER *br, int data, int bits) { +void vp8_encode_value(BOOL_CODER *bc, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) { - vp8_encode_bool(br, (1 & (data >> bit)), 0x80); + vp8_encode_bool(bc, (1 & (data >> bit)), 0x80); } } diff --git a/media/libvpx/libvpx/vp8/encoder/boolhuff.h b/media/libvpx/libvpx/vp8/encoder/boolhuff.h index d001eea9cd..a8c536b99c 100644 --- a/media/libvpx/libvpx/vp8/encoder/boolhuff.h +++ b/media/libvpx/libvpx/vp8/encoder/boolhuff.h @@ -9,14 +9,14 @@ */ /**************************************************************************** -* -* Module Title : boolhuff.h -* -* Description : Bool Coder header file. -* -****************************************************************************/ -#ifndef VP8_ENCODER_BOOLHUFF_H_ -#define VP8_ENCODER_BOOLHUFF_H_ + * + * Module Title : boolhuff.h + * + * Description : Bool Coder header file. + * + ****************************************************************************/ +#ifndef VPX_VP8_ENCODER_BOOLHUFF_H_ +#define VPX_VP8_ENCODER_BOOLHUFF_H_ #include "vpx_ports/mem.h" #include "vpx/internal/vpx_codec_internal.h" @@ -35,11 +35,11 @@ typedef struct { struct vpx_internal_error_info *error; } BOOL_CODER; -extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, - unsigned char *buffer_end); +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, + unsigned char *source_end); -extern void vp8_encode_value(BOOL_CODER *br, int data, int bits); -extern void vp8_stop_encode(BOOL_CODER *bc); +void vp8_encode_value(BOOL_CODER *bc, int data, int bits); +void vp8_stop_encode(BOOL_CODER *bc); extern const unsigned int vp8_prob_cost[256]; DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); @@ -56,23 +56,12 @@ static int validate_buffer(const unsigned char *start, size_t len, return 0; } -static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { +static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { unsigned int split; - int count = br->count; - unsigned int range = br->range; - unsigned int lowvalue = br->lowvalue; - register int shift; - -#ifdef VP8_ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp8_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp8_prob_cost[probability]; - -#endif -#endif + int count = bc->count; + unsigned int range = bc->range; + unsigned int lowvalue = bc->lowvalue; + int shift; split = 1 + (((range - 1) * probability) >> 8); @@ -80,7 +69,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { if (bit) { lowvalue += split; - range = br->range - split; + range = bc->range - split; } shift = vp8_norm[range]; @@ -92,33 +81,32 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { int offset = shift - count; if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = br->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && br->buffer[x] == 0xff) { - br->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - br->buffer[x] += 1; + bc->buffer[x] += 1; } - validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error); - br->buffer[br->pos++] = (lowvalue >> (24 - offset)); + validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff); - lowvalue <<= offset; shift = count; - lowvalue &= 0xffffff; + lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff); count -= 8; } lowvalue <<= shift; - br->count = count; - br->lowvalue = lowvalue; - br->range = range; + bc->count = count; + bc->lowvalue = lowvalue; + bc->range = range; } #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_BOOLHUFF_H_ +#endif // VPX_VP8_ENCODER_BOOLHUFF_H_ diff --git a/media/libvpx/libvpx/vp8/common/copy_c.c b/media/libvpx/libvpx/vp8/encoder/copy_c.c similarity index 100% rename from media/libvpx/libvpx/vp8/common/copy_c.c rename to media/libvpx/libvpx/vp8/encoder/copy_c.c diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h index 278dce73f4..0cd6cb4e65 100644 --- a/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h +++ b/media/libvpx/libvpx/vp8/encoder/dct_value_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_COST_H_ -#define VP8_ENCODER_DCT_VALUE_COST_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_COST_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_COST_H_ #ifdef __cplusplus extern "C" { @@ -341,4 +341,4 @@ static const short dct_value_cost[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_COST_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h index 0597deab2d..5cc4505f09 100644 --- a/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h +++ b/media/libvpx/libvpx/vp8/encoder/dct_value_tokens.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_ -#define VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ #ifdef __cplusplus extern "C" { @@ -845,4 +845,4 @@ static const TOKENVALUE dct_value_tokens[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h index 2976325dc5..a3ab34c8a0 100644 --- a/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h +++ b/media/libvpx/libvpx/vp8/encoder/defaultcoefcounts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ -#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#ifndef VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#define VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ #ifdef __cplusplus extern "C" { @@ -232,4 +232,4 @@ static const unsigned int default_coef_counts } // extern "C" #endif -#endif // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#endif // VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.c b/media/libvpx/libvpx/vp8/encoder/denoising.c index eb963b97e3..a666bca4d2 100644 --- a/media/libvpx/libvpx/vp8/encoder/denoising.c +++ b/media/libvpx/libvpx/vp8/encoder/denoising.c @@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, // When adopting aggressive denoiser, the adj_val for each pixel // could be at most 8 (this is current max adjustment of the map). // In SSE code, we calculate the sum of adj_val for - // the columns, so the sum could be upto 128(16 rows). However, + // the columns, so the sum could be up to 128(16 rows). However, // the range of the value is -128 ~ 127 in SSE code, that's why // we do this change in C code. // We don't do this for UV denoiser, since there are only 8 rows, @@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, return FILTER_BLOCK; } -int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, - int mc_avg_uv_stride, - unsigned char *running_avg_uv, int avg_uv_stride, +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, + unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { - unsigned char *running_avg_uv_start = running_avg_uv; + unsigned char *running_avg_start = running_avg; unsigned char *sig_start = sig; int sum_diff_thresh; int r, c; @@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, int adjustment = 0; int absdiff = 0; - diff = mc_running_avg_uv[c] - sig[c]; + diff = mc_running_avg[c] - sig[c]; absdiff = abs(diff); // When |diff| <= |3 + shift_inc1|, use pixel value from // last denoised raw. if (absdiff <= 3 + shift_inc1) { - running_avg_uv[c] = mc_running_avg_uv[c]; + running_avg[c] = mc_running_avg[c]; sum_diff += diff; } else { if (absdiff >= 4 && absdiff <= 7) { @@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } if (diff > 0) { if ((sig[c] + adjustment) > 255) { - running_avg_uv[c] = 255; + running_avg[c] = 255; } else { - running_avg_uv[c] = sig[c] + adjustment; + running_avg[c] = sig[c] + adjustment; } sum_diff += adjustment; } else { if ((sig[c] - adjustment) < 0) { - running_avg_uv[c] = 0; + running_avg[c] = 0; } else { - running_avg_uv[c] = sig[c] - adjustment; + running_avg[c] = sig[c] - adjustment; } sum_diff -= adjustment; } @@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } /* Update pointers for next iteration. */ sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; @@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // Only apply the adjustment for max delta up to 3. if (delta < 4) { sig -= sig_stride * 8; - mc_running_avg_uv -= mc_avg_uv_stride * 8; - running_avg_uv -= avg_uv_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; for (r = 0; r < 8; ++r) { for (c = 0; c < 8; ++c) { - int diff = mc_running_avg_uv[c] - sig[c]; + int diff = mc_running_avg[c] - sig[c]; int adjustment = abs(diff); if (adjustment > delta) adjustment = delta; if (diff > 0) { // Bring denoised signal down. - if (running_avg_uv[c] - adjustment < 0) { - running_avg_uv[c] = 0; + if (running_avg[c] - adjustment < 0) { + running_avg[c] = 0; } else { - running_avg_uv[c] = running_avg_uv[c] - adjustment; + running_avg[c] = running_avg[c] - adjustment; } sum_diff -= adjustment; } else if (diff < 0) { // Bring denoised signal up. - if (running_avg_uv[c] + adjustment > 255) { - running_avg_uv[c] = 255; + if (running_avg[c] + adjustment > 255) { + running_avg[c] = 255; } else { - running_avg_uv[c] = running_avg_uv[c] + adjustment; + running_avg[c] = running_avg[c] + adjustment; } sum_diff += adjustment; } @@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // TODO(marpan): Check here if abs(sum_diff) has gone below the // threshold sum_diff_thresh, and if so, we can exit the row loop. sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; } else { @@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } } - vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride); + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); return FILTER_BLOCK; } diff --git a/media/libvpx/libvpx/vp8/encoder/denoising.h b/media/libvpx/libvpx/vp8/encoder/denoising.h index 91d87b3a1c..51ae3b0ab3 100644 --- a/media/libvpx/libvpx/vp8/encoder/denoising.h +++ b/media/libvpx/libvpx/vp8/encoder/denoising.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DENOISING_H_ -#define VP8_ENCODER_DENOISING_H_ +#ifndef VPX_VP8_ENCODER_DENOISING_H_ +#define VPX_VP8_ENCODER_DENOISING_H_ #include "block.h" #include "vp8/common/loopfilter.h" @@ -100,4 +100,4 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, } // extern "C" #endif -#endif // VP8_ENCODER_DENOISING_H_ +#endif // VPX_VP8_ENCODER_DENOISING_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/libvpx/vp8/encoder/encodeframe.c index c7ad3bfe2c..97855ae003 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeframe.c +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.c @@ -7,41 +7,37 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include #include "vpx_config.h" -#include "vp8_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "encodemb.h" -#include "encodemv.h" + #include "vp8/common/common.h" -#include "onyx_int.h" -#include "vp8/common/extend.h" #include "vp8/common/entropymode.h" -#include "vp8/common/quant_common.h" -#include "segmentation.h" -#include "vp8/common/setupintrarecon.h" -#include "encodeintra.h" -#include "vp8/common/reconinter.h" -#include "rdopt.h" -#include "pickinter.h" -#include "vp8/common/findnearmv.h" -#include -#include +#include "vp8/common/extend.h" #include "vp8/common/invtrans.h" +#include "vp8/common/quant_common.h" +#include "vp8/common/reconinter.h" +#include "vp8/common/setupintrarecon.h" +#include "vp8/common/threading.h" +#include "vp8/encoder/bitstream.h" +#include "vp8/encoder/encodeframe.h" +#include "vp8/encoder/encodeintra.h" +#include "vp8/encoder/encodemb.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/pickinter.h" +#include "vp8/encoder/rdopt.h" +#include "vp8_rtcd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_dsp_rtcd.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" -#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING -#include "bitstream.h" + +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" #endif -#include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); -extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, - int prob_last, int prob_garf); -extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); -extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); -extern void vp8_auto_select_speed(VP8_COMP *cpi); -extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, - MB_ROW_COMP *mbr_ei, int count); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); #ifdef MODE_STATS @@ -67,15 +63,14 @@ unsigned int b_modes[14] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const unsigned char VP8_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; +static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128 }; /* Original activity measure from Tim T's code. */ -static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { +static unsigned int tt_activity_measure(MACROBLOCK *x) { unsigned int act; unsigned int sse; - (void)cpi; /* TODO: This could also be done over smaller areas (8x8), but that would * require extensive changes elsewhere, as lambda is assumed to be fixed * over an entire MB in most of the code. @@ -93,28 +88,21 @@ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { return act; } -/* Stub for alternative experimental activity measures. */ -static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x, - int use_dc_pred) { - return vp8_encode_intra(cpi, x, use_dc_pred); -} - /* Measure the activity of the current macroblock * What we measure here is TBD so abstracted to this function */ #define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x, - int mb_row, int mb_col) { +static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { unsigned int mb_activity; if (ALT_ACT_MEASURE) { int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - /* Or use and alternative. */ - mb_activity = alt_activity_measure(cpi, x, use_dc_pred); + /* Or use an alternative. */ + mb_activity = vp8_encode_intra(x, use_dc_pred); } else { /* Original activity measure from Tim T's code. */ - mb_activity = tt_activity_measure(cpi, x); + mb_activity = tt_activity_measure(x); } if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN; @@ -134,7 +122,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { unsigned int tmp; /* Create a list to sort to */ - CHECK_MEM_ERROR(sortlist, + CHECK_MEM_ERROR(&cpi->common.error, sortlist, vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); /* Copy map to sort list */ @@ -267,7 +255,7 @@ static void build_activity_map(VP8_COMP *cpi) { vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); /* measure activity */ - mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col); + mb_activity = mb_activity_measure(x, mb_row, mb_col); /* Keep frame sum */ activity_sum += mb_activity; @@ -344,11 +332,14 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; - const int rightmost_col = cm->mb_cols + nsync; - const int *last_row_current_mb_col; - int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = NULL; - if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + current_mb_col = &cpi->mt_current_mb_col[mb_row]; + } + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { last_row_current_mb_col = &rightmost_col; @@ -418,15 +409,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } } #endif @@ -455,13 +444,21 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, x->active_ptr = cpi->active_map + map_index + mb_col; if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp); + const int intra_rate_cost = vp8cx_encode_intra_macroblock(cpi, x, tp); + if (INT_MAX - *totalrate > intra_rate_cost) + *totalrate += intra_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS y_modes[xd->mbmi.mode]++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock( + const int inter_rate_cost = vp8cx_encode_inter_macroblock( cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); + if (INT_MAX - *totalrate > inter_rate_cost) + *totalrate += inter_rate_cost; + else + *totalrate = INT_MAX; #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode]++; @@ -566,8 +563,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { - protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col); + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); } #endif @@ -635,12 +633,13 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi) { cpi->prob_last_coded, cpi->prob_gf_coded); } - xd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8; + xd->fullpixel_mask = ~0; + if (cm->full_pixel) xd->fullpixel_mask = ~7; vp8_zero(x->coef_counts); vp8_zero(x->ymode_count); - vp8_zero(x->uv_mode_count) x->prediction_error = 0; + vp8_zero(x->uv_mode_count); + x->prediction_error = 0; x->intra_error = 0; vp8_zero(x->count_mb_ref_frame_usage); } @@ -748,30 +747,42 @@ void vp8_encode_frame(VP8_COMP *cpi) { #endif { +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); +#endif #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { int i; vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); - for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1; + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #else - tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); + tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24); #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); @@ -794,7 +805,7 @@ void vp8_encode_frame(VP8_COMP *cpi) { } /* Wait for all the threads to finish. */ for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_wait(&cpi->h_event_end_encoding[i]); + vp8_sem_wait(&cpi->h_event_end_encoding[i]); } for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { @@ -858,10 +869,10 @@ void vp8_encode_frame(VP8_COMP *cpi) { /* for each macroblock row in image */ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - vp8_zero(cm->left_context) + vp8_zero(cm->left_context); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING - tp = cpi->tok; + tp = cpi->tok; #endif encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate); @@ -885,8 +896,10 @@ void vp8_encode_frame(VP8_COMP *cpi) { } #endif +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&emr_timer); cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); +#endif } // Work out the segment probabilities if segmentation is enabled diff --git a/media/libvpx/libvpx/vp8/encoder/encodeframe.h b/media/libvpx/libvpx/vp8/encoder/encodeframe.h index c1d8634927..cc8cf4d713 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeframe.h +++ b/media/libvpx/libvpx/vp8/encoder/encodeframe.h @@ -7,29 +7,34 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEFRAME_H_ -#define VP8_ENCODER_ENCODEFRAME_H_ +#ifndef VPX_VP8_ENCODER_ENCODEFRAME_H_ +#define VPX_VP8_ENCODER_ENCODEFRAME_H_ + +#include "vp8/encoder/tokenize.h" #ifdef __cplusplus extern "C" { #endif -extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_build_block_offsets(MACROBLOCK *x); +struct VP8_COMP; +struct macroblock; -extern void vp8_setup_block_ptrs(MACROBLOCK *x); +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_encode_frame(VP8_COMP *cpi); +void vp8_build_block_offsets(struct macroblock *x); -extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t, int recon_yoffset, - int recon_uvoffset, int mb_row, - int mb_col); +void vp8_setup_block_ptrs(struct macroblock *x); -extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t); +void vp8_encode_frame(struct VP8_COMP *cpi); + +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); + +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEFRAME_H_ +#endif // VPX_VP8_ENCODER_ENCODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.c b/media/libvpx/libvpx/vp8/encoder/encodeintra.c index f89e7cb1fa..7d448c0ea0 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeintra.c +++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.c @@ -18,10 +18,9 @@ #include "vp8/common/invtrans.h" #include "encodeintra.h" -int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) { +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) { int i; int intra_pred_var = 0; - (void)cpi; if (use_dc_pred) { x->e_mbd.mode_info_context->mbmi.mode = DC_PRED; diff --git a/media/libvpx/libvpx/vp8/encoder/encodeintra.h b/media/libvpx/libvpx/vp8/encoder/encodeintra.h index 3956cf5fb1..9a378abf49 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodeintra.h +++ b/media/libvpx/libvpx/vp8/encoder/encodeintra.h @@ -8,15 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEINTRA_H_ -#define VP8_ENCODER_ENCODEINTRA_H_ +#ifndef VPX_VP8_ENCODER_ENCODEINTRA_H_ +#define VPX_VP8_ENCODER_ENCODEINTRA_H_ #include "onyx_int.h" #ifdef __cplusplus extern "C" { #endif -int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred); +int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred); void vp8_encode_intra16x16mby(MACROBLOCK *x); void vp8_encode_intra16x16mbuv(MACROBLOCK *x); void vp8_encode_intra4x4mby(MACROBLOCK *mb); @@ -25,4 +25,4 @@ void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEINTRA_H_ +#endif // VPX_VP8_ENCODER_ENCODEINTRA_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.c b/media/libvpx/libvpx/vp8/encoder/encodemb.c index 3fd8d5fabe..052d09ba3b 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodemb.c +++ b/media/libvpx/libvpx/vp8/encoder/encodemb.c @@ -396,8 +396,8 @@ static void optimize_mb(MACROBLOCK *x) { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *x->e_mbd.above_context; + t_left = *x->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; @@ -437,8 +437,8 @@ void vp8_optimize_mby(MACROBLOCK *x) { if (!x->e_mbd.left_context) return; - memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *x->e_mbd.above_context; + t_left = *x->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; @@ -470,8 +470,8 @@ void vp8_optimize_mbuv(MACROBLOCK *x) { if (!x->e_mbd.left_context) return; - memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *x->e_mbd.above_context; + t_left = *x->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; diff --git a/media/libvpx/libvpx/vp8/encoder/encodemb.h b/media/libvpx/libvpx/vp8/encoder/encodemb.h index b55ba3ac3f..db577ddc10 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodemb.h +++ b/media/libvpx/libvpx/vp8/encoder/encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEMB_H_ -#define VP8_ENCODER_ENCODEMB_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMB_H_ +#define VPX_VP8_ENCODER_ENCODEMB_H_ #include "onyx_int.h" @@ -37,4 +37,4 @@ void vp8_encode_inter16x16y(MACROBLOCK *x); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMB_H_ +#endif // VPX_VP8_ENCODER_ENCODEMB_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.c b/media/libvpx/libvpx/vp8/encoder/encodemv.c index 36e9a9078c..384bb29389 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodemv.c +++ b/media/libvpx/libvpx/vp8/encoder/encodemv.c @@ -16,38 +16,30 @@ #include -#ifdef VP8_ENTROPY_STATS -extern unsigned int active_section; -#endif - static void encode_mvcomponent(vp8_writer *const w, const int v, const struct mv_context *mvc) { const vp8_prob *p = mvc->prob; const int x = v < 0 ? -v : v; - if (x < mvnum_short) /* Small */ - { + if (x < mvnum_short) { /* Small */ vp8_write(w, 0, p[mvpis_short]); vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); if (!x) return; /* no sign bit */ - } else /* Large */ - { + } else { /* Large */ int i = 0; vp8_write(w, 1, p[mvpis_short]); - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (++i < 3); + } while (++i < 3); i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */ - do + do { vp8_write(w, (x >> i) & 1, p[MVPbits + i]); - - while (--i > 3); + } while (--i > 3); if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]); } @@ -166,7 +158,7 @@ static void calc_prob(vp8_prob *p, const unsigned int ct[2]) { const unsigned int tot = ct[0] + ct[1]; if (tot) { - const vp8_prob x = ((ct[0] * 255) / tot) & -2; + const vp8_prob x = ((ct[0] * 255) / tot) & ~1u; *p = x ? x : 1; } } @@ -211,8 +203,11 @@ static void write_component_probs(vp8_writer *const w, (void)rc; vp8_copy_array(Pnew, default_mvc, MVPcount); - vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct) - vp8_zero(short_bct) + vp8_zero(is_short_ct); + vp8_zero(sign_ct); + vp8_zero(bit_ct); + vp8_zero(short_ct); + vp8_zero(short_bct); /* j=0 */ { @@ -311,9 +306,6 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = { 0, 0 }; -#ifdef VP8_ENTROPY_STATS - active_section = 4; -#endif write_component_probs(w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->mb.MVcount[0], 0, &flags[0]); @@ -325,8 +317,4 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_build_component_cost_table( cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flags); } - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif } diff --git a/media/libvpx/libvpx/vp8/encoder/encodemv.h b/media/libvpx/libvpx/vp8/encoder/encodemv.h index 87db30f310..347b9feffe 100644 --- a/media/libvpx/libvpx/vp8/encoder/encodemv.h +++ b/media/libvpx/libvpx/vp8/encoder/encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEMV_H_ -#define VP8_ENCODER_ENCODEMV_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMV_H_ +#define VPX_VP8_ENCODER_ENCODEMV_H_ #include "onyx_int.h" @@ -26,4 +26,4 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMV_H_ +#endif // VPX_VP8_ENCODER_ENCODEMV_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.c b/media/libvpx/libvpx/vp8/encoder/ethreading.c index df34997acc..98c87d3cbc 100644 --- a/media/libvpx/libvpx/vp8/encoder/ethreading.c +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.c @@ -7,49 +7,52 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include "onyx_int.h" +#include "vpx_util/vpx_pthread.h" #include "vp8/common/threading.h" #include "vp8/common/common.h" #include "vp8/common/extend.h" #include "bitstream.h" #include "encodeframe.h" +#include "ethreading.h" #if CONFIG_MULTITHREAD extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip); -static THREAD_FUNCTION thread_loopfilter(void *p_data) { +static THREADFN thread_loopfilter(void *p_data) { VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1); VP8_COMMON *cm = &cpi->common; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_lpf) == 0) { + if (vp8_sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); - sem_post(&cpi->h_event_end_lpf); + vp8_sem_post(&cpi->h_event_end_lpf); } } - return 0; + return THREAD_EXIT_SUCCESS; } -static THREAD_FUNCTION thread_encoding_proc(void *p_data) { +static THREADFN thread_encoding_proc(void *p_data) { int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread; VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1); MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2); ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; - if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { + if (vp8_sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; VP8_COMMON *cm = &cpi->common; int mb_row; @@ -65,7 +68,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); xd->mode_info_stride = cm->mode_info_stride; @@ -79,8 +82,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = (mb_row * cm->mb_cols); - const int *last_row_current_mb_col; - int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -107,13 +110,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -285,7 +286,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -307,12 +308,12 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count; } /* Signal that this thread has completed processing its rows. */ - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); } } /* printf("exit thread %d\n", ithread); */ - return 0; + return THREAD_EXIT_SUCCESS; } static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { @@ -403,7 +404,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; zd->segmentation_enabled = xd->segmentation_enabled; - zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -471,8 +472,8 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, setup_mbby_copy(&mbr_ei[i].mb, x); - mbd->fullpixel_mask = 0xffffffff; - if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8; + mbd->fullpixel_mask = ~0; + if (cm->full_pixel) mbd->fullpixel_mask = ~7; vp8_zero(mb->coef_counts); vp8_zero(x->ymode_count); @@ -488,17 +489,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - - cpi->b_multi_threaded = 0; - cpi->encoding_thread_count = 0; - cpi->b_lpf_running = 0; - - pthread_mutex_init(&cpi->mt_mutex, NULL); + int th_count = 0; if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { - int ithread; - int th_count = cpi->oxcf.multi_threaded - 1; - int rc = 0; + th_count = cpi->oxcf.multi_threaded - 1; /* don't allocate more threads than cores available */ if (cpi->oxcf.multi_threaded > cm->processor_core_count) { @@ -510,22 +504,27 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; } + } + if (th_count == cpi->encoding_thread_count) return 0; - if (th_count == 0) return 0; + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; - CHECK_MEM_ERROR(cpi->h_encoding_thread, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_start_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_end_encoding, - vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->mb_row_ei, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, + vpx_malloc(sizeof(vp8_sem_t) * th_count)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, + vpx_malloc(sizeof(vp8_sem_t) * th_count)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); - CHECK_MEM_ERROR(cpi->en_thread_data, + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); - cpi->b_multi_threaded = 1; + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); cpi->encoding_thread_count = th_count; /* @@ -540,8 +539,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb); vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd); - sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); - sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_start_encoding[ithread], 0, 0); + vp8_sem_init(&cpi->h_event_end_encoding[ithread], 0, 0); ethd->ithread = ithread; ethd->ptr1 = (void *)cpi; @@ -554,21 +553,27 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); - - pthread_mutex_destroy(&cpi->mt_mutex); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -1; } @@ -576,33 +581,37 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { { LPFTHREAD_DATA *lpfthd = &cpi->lpf_thread_data; - sem_init(&cpi->h_event_start_lpf, 0, 0); - sem_init(&cpi->h_event_end_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_start_lpf, 0, 0); + vp8_sem_init(&cpi->h_event_end_lpf, 0, 0); lpfthd->ptr1 = (void *)cpi; rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter, lpfthd); if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { - sem_post(&cpi->h_event_start_encoding[ithread]); - sem_post(&cpi->h_event_end_encoding[ithread]); + vp8_sem_post(&cpi->h_event_start_encoding[ithread]); + vp8_sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); - sem_destroy(&cpi->h_event_start_encoding[ithread]); - sem_destroy(&cpi->h_event_end_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_start_encoding[ithread]); + vp8_sem_destroy(&cpi->h_event_end_encoding[ithread]); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); - - pthread_mutex_destroy(&cpi->mt_mutex); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -2; } @@ -612,36 +621,45 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); { int i; for (i = 0; i < cpi->encoding_thread_count; ++i) { - sem_post(&cpi->h_event_start_encoding[i]); - sem_post(&cpi->h_event_end_encoding[i]); + vp8_sem_post(&cpi->h_event_start_encoding[i]); + vp8_sem_post(&cpi->h_event_end_encoding[i]); pthread_join(cpi->h_encoding_thread[i], 0); - sem_destroy(&cpi->h_event_start_encoding[i]); - sem_destroy(&cpi->h_event_end_encoding[i]); + vp8_sem_destroy(&cpi->h_event_start_encoding[i]); + vp8_sem_destroy(&cpi->h_event_end_encoding[i]); } - sem_post(&cpi->h_event_start_lpf); + vp8_sem_post(&cpi->h_event_start_lpf); pthread_join(cpi->h_filter_thread, 0); } - sem_destroy(&cpi->h_event_end_lpf); - sem_destroy(&cpi->h_event_start_lpf); + vp8_sem_destroy(&cpi->h_event_end_lpf); + vp8_sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; } - pthread_mutex_destroy(&cpi->mt_mutex); } #endif diff --git a/media/libvpx/libvpx/vp8/encoder/ethreading.h b/media/libvpx/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 0000000000..598fe60559 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_ETHREADING_H_ +#define VPX_VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_ETHREADING_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.c b/media/libvpx/libvpx/vp8/encoder/firstpass.c index 884d6e18b1..097dc0ed78 100644 --- a/media/libvpx/libvpx/vp8/encoder/firstpass.c +++ b/media/libvpx/libvpx/vp8/encoder/firstpass.c @@ -10,6 +10,7 @@ #include #include +#include #include #include "./vpx_dsp_rtcd.h" @@ -17,6 +18,7 @@ #include "block.h" #include "onyx_int.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "encodeintra.h" #include "vp8/common/common.h" #include "vp8/common/setupintrarecon.h" @@ -52,7 +54,7 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE]; #define KF_MB_INTRA_MIN 300 #define GF_MB_INTRA_MIN 200 -#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X)-.000001 : (X) + .000001) +#define DOUBLE_DIVIDE_CHECK(X) ((X) < 0 ? (X) - .000001 : (X) + .000001) #define POW1 (double)cpi->oxcf.two_pass_vbrbias / 100.0 #define POW2 (double)cpi->oxcf.two_pass_vbrbias / 100.0 @@ -113,11 +115,9 @@ static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps) { return 1; } -static void output_stats(const VP8_COMP *cpi, - struct vpx_codec_pkt_list *pktlist, +static void output_stats(struct vpx_codec_pkt_list *pktlist, FIRSTPASS_STATS *stats) { struct vpx_codec_cx_pkt pkt; - (void)cpi; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); @@ -354,10 +354,11 @@ static int frame_max_bits(VP8_COMP *cpi) { /* For VBR base this on the bits and frames left plus the * two_pass_vbrmax_section rate passed in by the user */ - max_bits = (int)(((double)cpi->twopass.bits_left / - (cpi->twopass.total_stats.count - - (double)cpi->common.current_video_frame)) * - ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); + max_bits = saturate_cast_double_to_int( + ((double)cpi->twopass.bits_left / + (cpi->twopass.total_stats.count - + (double)cpi->common.current_video_frame)) * + ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0)); } /* Trap case where we are out of bits */ @@ -371,11 +372,10 @@ void vp8_init_first_pass(VP8_COMP *cpi) { } void vp8_end_first_pass(VP8_COMP *cpi) { - output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats); + output_stats(cpi->output_pkt_list, &cpi->twopass.total_stats); } -static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x, - YV12_BUFFER_CONFIG *raw_buffer, +static void zz_motion_search(MACROBLOCK *x, YV12_BUFFER_CONFIG *raw_buffer, int *raw_motion_err, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) { @@ -389,7 +389,6 @@ static void zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int raw_stride = raw_buffer->y_stride; unsigned char *ref_ptr; int ref_stride = x->e_mbd.pre.y_stride; - (void)cpi; /* Set up pointers for this macro block raw buffer */ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset); @@ -416,7 +415,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv ref_mv_full; int tmp_err; - int step_param = 3; /* Dont search over full range for first pass */ + int step_param = 3; /* Don't search over full range for first pass */ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; @@ -571,7 +570,7 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); /* do intra 16x16 prediction */ - this_error = vp8_encode_intra(cpi, x, use_dc_pred); + this_error = vp8_encode_intra(x, use_dc_pred); /* "intrapenalty" below deals with situations where the intra * and inter error scores are very low (eg a plain black frame) @@ -603,9 +602,8 @@ void vp8_first_pass(VP8_COMP *cpi) { int raw_motion_error = INT_MAX; /* Simple 0,0 motion with no mv overhead */ - zz_motion_search(cpi, x, cpi->last_frame_unscaled_source, - &raw_motion_error, lst_yv12, &motion_error, - recon_yoffset); + zz_motion_search(x, cpi->last_frame_unscaled_source, &raw_motion_error, + lst_yv12, &motion_error, recon_yoffset); d->bmi.mv.as_mv.row = 0; d->bmi.mv.as_mv.col = 0; @@ -797,8 +795,8 @@ void vp8_first_pass(VP8_COMP *cpi) { fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); /* don't want to do output stats with a stack variable! */ - memcpy(&cpi->twopass.this_frame_stats, &fps, sizeof(FIRSTPASS_STATS)); - output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats); + cpi->twopass.this_frame_stats = fps; + output_stats(cpi->output_pkt_list, &cpi->twopass.this_frame_stats); accumulate_stats(&cpi->twopass.total_stats, &fps); } @@ -826,22 +824,6 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } - /* use this to see what the first pass reconstruction looks like */ - if (0) { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) { - recon_file = fopen(filename, "wb"); - } else { - recon_file = fopen(filename, "ab"); - } - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - cm->current_video_frame++; } extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -908,9 +890,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor, correction_factor = pow(error_term, power_term); /* Clip range */ - correction_factor = (correction_factor < 0.05) - ? 0.05 - : (correction_factor > 5.0) ? 5.0 : correction_factor; + correction_factor = (correction_factor < 0.05) ? 0.05 + : (correction_factor > 5.0) ? 5.0 + : correction_factor; return correction_factor; } @@ -952,11 +934,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, } cpi->twopass.est_max_qcorrection_factor = - (cpi->twopass.est_max_qcorrection_factor < 0.1) - ? 0.1 - : (cpi->twopass.est_max_qcorrection_factor > 10.0) - ? 10.0 - : cpi->twopass.est_max_qcorrection_factor; + (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1 + : (cpi->twopass.est_max_qcorrection_factor > 10.0) + ? 10.0 + : cpi->twopass.est_max_qcorrection_factor; } /* Corrections for higher compression speed settings @@ -989,11 +970,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, bits_per_mb_at_this_q = vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; - bits_per_mb_at_this_q = (int)(.5 + - err_correction_factor * speed_correction * - cpi->twopass.est_max_qcorrection_factor * - cpi->twopass.section_max_qfactor * - (double)bits_per_mb_at_this_q); + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1044,12 +1025,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (0) { - FILE *f = fopen("epmp.stt", "a"); - fprintf(f, "%10.2f\n", err_per_mb); - fclose(f); - } - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); @@ -1086,9 +1061,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; bits_per_mb_at_this_q = - (int)(.5 + - err_correction_factor * speed_correction * clip_iifactor * - (double)bits_per_mb_at_this_q); + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1184,10 +1158,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, } else { current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits; - current_spend_ratio = - (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) - ? 0.1 - : current_spend_ratio; + current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 + : (current_spend_ratio < 0.1) ? 0.1 + : current_spend_ratio; } /* Calculate a correction factor based on the quality of prediction in @@ -1238,17 +1211,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (0) { - FILE *f = fopen("estkf_q.stt", "a"); - fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", - cpi->common.current_video_frame, bits_per_mb_at_this_q, - target_norm_bits_per_mb, err_per_mb, err_correction_factor, - current_spend_ratio, group_iiratio, iiratio_correction_factor, - (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, - Q); - fclose(f); - } - return Q; } @@ -1276,7 +1238,6 @@ void vp8_init_second_pass(VP8_COMP *cpi) { vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0); cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * @@ -1338,12 +1299,10 @@ void vp8_end_second_pass(VP8_COMP *cpi) { (void)cpi; } /* This function gives and estimate of how badly we believe the prediction * quality is decaying from frame to frame. */ -static double get_prediction_decay_rate(VP8_COMP *cpi, - FIRSTPASS_STATS *next_frame) { +static double get_prediction_decay_rate(FIRSTPASS_STATS *next_frame) { double prediction_decay_rate; double motion_decay; double motion_pct = next_frame->pcnt_motion; - (void)cpi; /* Initial basis is the % mbs inter coded */ prediction_decay_rate = next_frame->pcnt_inter; @@ -1400,7 +1359,7 @@ static int detect_transition_to_still(VP8_COMP *cpi, int frame_interval, for (j = 0; j < still_interval; ++j) { if (EOF == input_stats(cpi, &tmp_next_frame)) break; - decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame); + decay_rate = get_prediction_decay_rate(&tmp_next_frame); if (decay_rate < 0.999) break; } /* Reset file position */ @@ -1451,8 +1410,7 @@ static int detect_flash(VP8_COMP *cpi, int offset) { } /* Update the motion related elements to the GF arf boost calculation */ -static void accumulate_frame_motion_stats(VP8_COMP *cpi, - FIRSTPASS_STATS *this_frame, +static void accumulate_frame_motion_stats(FIRSTPASS_STATS *this_frame, double *this_frame_mv_in_out, double *mv_in_out_accumulator, double *abs_mv_in_out_accumulator, @@ -1460,7 +1418,6 @@ static void accumulate_frame_motion_stats(VP8_COMP *cpi, double this_frame_mvr_ratio; double this_frame_mvc_ratio; double motion_pct; - (void)cpi; /* Accumulate motion stats. */ motion_pct = this_frame->pcnt_motion; @@ -1543,13 +1500,13 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Update the motion related elements to the boost calculation */ accumulate_frame_motion_stats( - cpi, &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1558,7 +1515,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Cumulative effect of prediction quality decay */ if (!flash_detected) { decay_accumulator = - decay_accumulator * get_prediction_decay_rate(cpi, &this_frame); + decay_accumulator * get_prediction_decay_rate(&this_frame); decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; } boost_score += (decay_accumulator * r); @@ -1587,13 +1544,13 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Update the motion related elements to the boost calculation */ accumulate_frame_motion_stats( - cpi, &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1602,7 +1559,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Cumulative effect of prediction quality decay */ if (!flash_detected) { decay_accumulator = - decay_accumulator * get_prediction_decay_rate(cpi, &this_frame); + decay_accumulator * get_prediction_decay_rate(&this_frame); decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; } @@ -1641,7 +1598,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double mod_err_per_mb_accumulator = 0.0; int max_bits = frame_max_bits(cpi); /* Max for a single frame */ @@ -1692,9 +1648,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group_err += mod_frame_err; - mod_err_per_mb_accumulator += - mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs); - if (EOF == input_stats(cpi, &next_frame)) break; /* Test for the case where there is a brief flash but the prediction @@ -1704,7 +1657,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Update the motion related elements to the boost calculation */ accumulate_frame_motion_stats( - cpi, &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); /* Calculate a baseline boost number for this frame */ @@ -1712,7 +1665,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Cumulative effect of prediction quality decay */ if (!flash_detected) { - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(&next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; } @@ -1733,20 +1686,21 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Break at cpi->max_gf_interval unless almost totally static */ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || ( - /* Dont break out with a very short interval */ + /* Don't break out with a very short interval */ (i > MIN_GF_INTERVAL) && - /* Dont break out very close to a key frame */ + /* Don't break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && - (!flash_detected) && ((mv_ratio_accumulator > 100.0) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < 2.0)))) { + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { boost_score = old_boost_score; break; } - memcpy(this_frame, &next_frame, sizeof(*this_frame)); + *this_frame = next_frame; old_boost_score = boost_score; } @@ -1780,7 +1734,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (boost_score > max_boost) boost_score = max_boost; } - /* Dont allow conventional gf too near the next kf */ + /* Don't allow conventional gf too near the next kf */ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { i++; @@ -1801,9 +1755,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); #endif - /* Should we use the alternate refernce frame */ + /* Should we use the alternate reference frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && - /* dont use ARF very near next kf */ + /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && #if NEW_BOOST ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && @@ -1814,8 +1768,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (next_frame.pcnt_inter > 0.75) && ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && - (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <= - (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) #endif { int Boost; @@ -1980,11 +1935,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits - : cpi->twopass.gf_group_bits; + (cpi->twopass.gf_group_bits < 0) ? 0 + : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) + ? cpi->twopass.kf_group_bits + : cpi->twopass.gf_group_bits; /* Clip cpi->twopass.gf_group_bits based on user supplied data rate * variability limit (cpi->oxcf.two_pass_vbrmax_section) @@ -2052,8 +2006,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Calculate the number of bits to be spent on the gf or arf based on * the boost number */ - gf_bits = (int)((double)Boost * - (cpi->twopass.gf_group_bits / (double)allocation_chunks)); + gf_bits = saturate_cast_double_to_int( + (double)Boost * + (cpi->twopass.gf_group_bits / (double)allocation_chunks)); /* If the frame that is to be boosted is simpler than the average for * the gf/arf group then use an alternative calculation @@ -2080,9 +2035,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { * score, otherwise it may be worse off than an "un-boosted" frame */ else { - int alt_gf_bits = - (int)((double)cpi->twopass.kf_group_bits * mod_frame_err / - DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left)); + // Avoid division by 0 by clamping cpi->twopass.kf_group_error_left to 1 + int alt_gf_bits = saturate_cast_double_to_int( + (double)cpi->twopass.kf_group_bits * mod_frame_err / + (double)VPXMAX(cpi->twopass.kf_group_error_left, 1)); if (alt_gf_bits > gf_bits) { gf_bits = alt_gf_bits; @@ -2096,7 +2052,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - /* Dont allow a negative value for gf_bits */ + /* Don't allow a negative value for gf_bits */ if (gf_bits < 0) gf_bits = 0; /* Add in minimum for a frame */ @@ -2137,7 +2093,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; /* This condition could fail if there are two kfs very close together - * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the * calculation of cpi->twopass.alt_extra_bits. */ if (cpi->baseline_gf_interval >= 3) { @@ -2216,7 +2172,8 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* How many of those bits available for allocation should we give it? */ - target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction); + target_frame_size = saturate_cast_double_to_int( + (double)cpi->twopass.gf_group_bits * err_fraction); /* Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits) * at the top end. @@ -2279,7 +2236,7 @@ void vp8_second_pass(VP8_COMP *cpi) { /* keyframe and section processing ! */ if (cpi->twopass.frames_to_key == 0) { /* Define next KF group and assign bits to it */ - memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); /* Special case: Error error_resilient_mode mode does not make much @@ -2301,7 +2258,7 @@ void vp8_second_pass(VP8_COMP *cpi) { /* Is this a GF / ARF (Note that a KF is always also a GF) */ if (cpi->frames_till_gf_update_due == 0) { /* Define next gf group and assign bits to it */ - memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + this_frame_copy = this_frame; define_gf_group(cpi, &this_frame_copy); /* If we are going to code an altref frame at the end of the group @@ -2316,7 +2273,7 @@ void vp8_second_pass(VP8_COMP *cpi) { * to the GF group */ int bak = cpi->per_frame_bandwidth; - memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + this_frame_copy = this_frame; assign_std_frame_bits(cpi, &this_frame_copy); cpi->per_frame_bandwidth = bak; } @@ -2336,12 +2293,12 @@ void vp8_second_pass(VP8_COMP *cpi) { if (cpi->common.frame_type != KEY_FRAME) { /* Assign bits from those allocated to the GF group */ - memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + this_frame_copy = this_frame; assign_std_frame_bits(cpi, &this_frame_copy); } } else { /* Assign bits from those allocated to the GF group */ - memcpy(&this_frame_copy, &this_frame, sizeof(this_frame)); + this_frame_copy = this_frame; assign_std_frame_bits(cpi, &this_frame_copy); } } @@ -2371,13 +2328,15 @@ void vp8_second_pass(VP8_COMP *cpi) { if (cpi->common.current_video_frame == 0) { cpi->twopass.est_max_qcorrection_factor = 1.0; + int64_t section_target_bandwidth = cpi->twopass.bits_left / frames_left; + section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX); + /* Set a cq_level in constrained quality mode. */ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { int est_cq; est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)section_target_bandwidth, overhead_bits); cpi->cq_target_quality = cpi->oxcf.cq_level; if (est_cq > cpi->cq_target_quality) cpi->cq_target_quality = est_cq; @@ -2388,8 +2347,7 @@ void vp8_second_pass(VP8_COMP *cpi) { cpi->twopass.maxq_min_limit = cpi->best_quality; tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)section_target_bandwidth, overhead_bits); /* Limit the maxq value returned subsequently. * This increases the risk of overspend or underspend if the initial @@ -2407,7 +2365,7 @@ void vp8_second_pass(VP8_COMP *cpi) { } /* The last few frames of a clip almost always have to few or too many - * bits and for the sake of over exact rate control we dont want to make + * bits and for the sake of over exact rate control we don't want to make * radical adjustments to the allowed quantizer range just to use up a * few surplus bits or get beneath the target rate. */ @@ -2417,9 +2375,11 @@ void vp8_second_pass(VP8_COMP *cpi) { (unsigned int)cpi->twopass.total_stats.count)) { if (frames_left < 1) frames_left = 1; + int64_t section_target_bandwidth = cpi->twopass.bits_left / frames_left; + section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX); + tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left), - overhead_bits); + (int)section_target_bandwidth, overhead_bits); /* Move active_worst_quality but in a damped way */ if (tmp_q > cpi->active_worst_quality) { @@ -2470,7 +2430,7 @@ static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, double decay_accumulator = 1.0; double next_iiratio; - memcpy(&local_next_frame, next_frame, sizeof(*next_frame)); + local_next_frame = *next_frame; /* Note the starting file position so we can reset to it */ start_pos = cpi->twopass.stats_in; @@ -2563,7 +2523,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.frames_to_key = 1; /* Take a copy of the initial frame details */ - memcpy(&first_frame, this_frame, sizeof(*this_frame)); + first_frame = *this_frame; cpi->twopass.kf_group_bits = 0; cpi->twopass.kf_group_error_left = 0; @@ -2585,7 +2545,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_group_coded_err += this_frame->coded_error; /* Load the next frame's stats. */ - memcpy(&last_frame, this_frame, sizeof(*this_frame)); + last_frame = *this_frame; input_stats(cpi, this_frame); /* Provided that we are not at the end of the file... */ @@ -2598,7 +2558,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* How fast is prediction quality decaying */ - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(&next_frame); /* We want to know something about the recent past... rather than * as used elsewhere where we are concened with decay in prediction @@ -2648,7 +2608,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->twopass.frames_to_key /= 2; /* Copy first frame details */ - memcpy(&tmp_frame, &first_frame, sizeof(first_frame)); + tmp_frame = first_frame; /* Reset to the start of the group */ reset_fpf_position(cpi, start_position); @@ -2745,9 +2705,10 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { else if (cpi->twopass.kf_group_bits < av_group_bits) { int64_t bits_below_av = av_group_bits - cpi->twopass.kf_group_bits; - cpi->twopass.kf_group_bits += (int64_t)( - (double)bits_below_av * (double)(buffer_lvl - opt_buffer_lvl) / - (double)(high_water_mark - opt_buffer_lvl)); + cpi->twopass.kf_group_bits += + (int64_t)((double)bits_below_av * + (double)(buffer_lvl - opt_buffer_lvl) / + (double)(high_water_mark - opt_buffer_lvl)); } } } @@ -2780,7 +2741,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (r > RMAX) r = RMAX; /* How fast is prediction quality decaying */ - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(&next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator; @@ -3004,12 +2965,12 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. */ - av_bits_per_frame = cpi->oxcf.target_bandwidth / - DOUBLE_DIVIDE_CHECK((double)cpi->framerate); + av_bits_per_frame = + cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK(cpi->framerate); /* CBR... Use the clip average as the target for deciding resample */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { @@ -3025,7 +2986,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { bits_per_frame = (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); - /* Dont turn to resampling in easy sections just because they + /* Don't turn to resampling in easy sections just because they * have been assigned a small number of bits */ if (bits_per_frame < av_bits_per_frame) { @@ -3061,16 +3022,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", - cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, - cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } - /* The trigger for spatial resampling depends on the various * parameters such as whether we are streaming (CBR) or VBR. */ @@ -3089,9 +3040,9 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { resample_trigger = 0; } } else { - int64_t clip_bits = (int64_t)( - cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / - DOUBLE_DIVIDE_CHECK((double)cpi->framerate)); + int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * + cpi->oxcf.target_bandwidth / + DOUBLE_DIVIDE_CHECK(cpi->framerate)); int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; /* If triggered last time the threshold for triggering again is @@ -3134,17 +3085,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { */ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf( - f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, - cpi->common.horiz_scale, cpi->common.vert_scale, - kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } } } diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.h b/media/libvpx/libvpx/vp8/encoder/firstpass.h index ac8a7b1bfb..f5490f1eff 100644 --- a/media/libvpx/libvpx/vp8/encoder/firstpass.h +++ b/media/libvpx/libvpx/vp8/encoder/firstpass.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_FIRSTPASS_H_ -#define VP8_ENCODER_FIRSTPASS_H_ +#ifndef VPX_VP8_ENCODER_FIRSTPASS_H_ +#define VPX_VP8_ENCODER_FIRSTPASS_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); } // extern "C" #endif -#endif // VP8_ENCODER_FIRSTPASS_H_ +#endif // VPX_VP8_ENCODER_FIRSTPASS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.c b/media/libvpx/libvpx/vp8/encoder/lookahead.c index 37aa9eee84..49f851d019 100644 --- a/media/libvpx/libvpx/vp8/encoder/lookahead.c +++ b/media/libvpx/libvpx/vp8/encoder/lookahead.c @@ -66,8 +66,8 @@ struct lookahead_ctx *vp8_lookahead_init(unsigned int width, depth += 1; /* Align the buffer dimensions */ - width = (width + 15) & ~15; - height = (height + 15) & ~15; + width = (width + 15) & ~15u; + height = (height + 15) & ~15u; /* Allocate the lookahead structures */ ctx = calloc(1, sizeof(*ctx)); diff --git a/media/libvpx/libvpx/vp8/encoder/lookahead.h b/media/libvpx/libvpx/vp8/encoder/lookahead.h index a67f226946..bf0401190b 100644 --- a/media/libvpx/libvpx/vp8/encoder/lookahead.h +++ b/media/libvpx/libvpx/vp8/encoder/lookahead.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_LOOKAHEAD_H_ -#define VP8_ENCODER_LOOKAHEAD_H_ +#ifndef VPX_VP8_ENCODER_LOOKAHEAD_H_ +#define VPX_VP8_ENCODER_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -74,7 +74,7 @@ int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, int drain); #define PEEK_FORWARD 1 -#define PEEK_BACKWARD -1 +#define PEEK_BACKWARD (-1) /**\brief Get a future source buffer to encode * * \param[in] ctx Pointer to the lookahead context @@ -96,4 +96,4 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP8_ENCODER_LOOKAHEAD_H_ +#endif // VPX_VP8_ENCODER_LOOKAHEAD_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c new file mode 100644 index 0000000000..a08d4d3f63 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/dct_lsx.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } + +#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2) \ + { \ + __m128i tmp0_m, tmp1_m, tmp2_m; \ + \ + tmp0_m = __lsx_vreplvei_h(coeff, val0); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \ + DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1, \ + const2); \ + } + +#define RET_1_IF_NZERO_H(_in) \ + ({ \ + __m128i tmp_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + __m128i max_m = __lsx_vldi(0xFF); \ + \ + tmp_m = __lsx_vseqi_h(_in, 0); \ + tmp_m = __lsx_vxor_v(tmp_m, max_m); \ + tmp_m = __lsx_vand_v(tmp_m, one_m); \ + \ + tmp_m; \ + }) + +void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3; + __m128i tmp0, tmp1, tmp2, tmp3, const0, const1; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i out0, out1, out2, out3; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1, + in3); + in0 = __lsx_vadd_h(tmp0, tmp1); + in2 = __lsx_vsub_h(tmp0, tmp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); + tmp0 = __lsx_vilvl_h(in3, in1); + in1 = __lsx_vreplvei_h(coeff, 3); + out0 = __lsx_vpackev_h(zero, in1); + coeff = __lsx_vilvl_h(zero, coeff); + out1 = __lsx_vreplvei_w(coeff, 0); + DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0, + out1); + DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3); + tmp2 = __lsx_vadd_h(tmp0, tmp1); + tmp3 = __lsx_vsub_h(tmp0, tmp1); + DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2); + DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2); + DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2); + tmp1 = RET_1_IF_NZERO_H(in3); + DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1); + out3 = __lsx_vadd_w(out3, out1); + out1 = __lsx_vreplvei_w(coeff, 1); + DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1, + out3); + DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3); + out1 = __lsx_vadd_w(out1, tmp1); + DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) { + __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1; + __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w; + __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c }; + __m128i zero = __lsx_vldi(0); + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2); + in3 = __lsx_vldx(input, pitch3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1, + in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in2 = __lsx_vsub_h(temp0, temp1); + SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2); + temp0 = __lsx_vreplvei_h(coeff, 3); + vec1_w = __lsx_vpackev_h(zero, temp0); + coeff = __lsx_vilvh_h(zero, coeff); + vec3_w = __lsx_vreplvei_w(coeff, 0); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3); + in0 = __lsx_vadd_h(temp0, temp1); + in0 = __lsx_vaddi_hu(in0, 7); + in2 = __lsx_vsub_h(temp0, temp1); + in2 = __lsx_vaddi_hu(in2, 7); + in0 = __lsx_vsrai_h(in0, 4); + in2 = __lsx_vsrai_h(in2, 4); + DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w); + vec3_w = __lsx_vadd_w(vec3_w, vec1_w); + vec1_w = __lsx_vreplvei_w(coeff, 1); + const0 = RET_1_IF_NZERO_H(in3); + tmp1 = __lsx_vilvl_h(in3, in1); + tmp0 = __lsx_vilvh_h(in3, in1); + vec0_w = vec1_w; + vec2_w = vec3_w; + DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1, + vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w, + vec3_w); + DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3); + in1 = __lsx_vadd_h(in1, const0); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1); + __lsx_vst(temp0, output, 0); + __lsx_vst(temp1, output, 16); + + DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 32); + __lsx_vst(in2, output, 48); +} diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c new file mode 100644 index 0000000000..4ad4caba60 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) { + int32_t err = 0; + __m128i dq_coeff0, dq_coeff1, coeff0, coeff1; + __m128i reg0, reg1, reg2, reg3, error; + + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0, + dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1); + DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0, + reg2); + DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1, + reg3); + error = __lsx_vmul_w(reg0, reg0); + DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error); + error = __lsx_vmadd_w(error, reg3, reg3); + error = __lsx_vhaddw_d_w(error, error); + err = __lsx_vpickve2gr_w(error, 0); + err += __lsx_vpickve2gr_w(error, 2); + return err; +} + +int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) { + BLOCK *be; + BLOCKD *bd; + int16_t *coeff, *dq_coeff; + int32_t err = 0; + uint32_t loop_cnt; + __m128i src0, src1, src2, src3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error; + __m128i mask0 = __lsx_vldi(0xFF); + __m128i zero = __lsx_vldi(0); + + if (dc == 1) { + mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0); + } + + for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) { + int32_t loop_tmp = loop_cnt << 1; + be = &mb->block[loop_tmp]; + bd = &mb->e_mbd.block[loop_tmp]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0, + src1, tmp0, tmp1); + be = &mb->block[loop_tmp + 1]; + bd = &mb->e_mbd.block[loop_tmp + 1]; + coeff = be->coeff; + dq_coeff = bd->dqcoeff; + DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2, + src3, tmp2, tmp3); + DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3, + reg1, reg3, reg5, reg7); + DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0, + reg4); + error = __lsx_vmul_w(reg0, reg0); + DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3, + reg3, error, reg4, reg4, error, error, error, error); + DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error, + error); + error = __lsx_vmadd_w(error, reg7, reg7); + error = __lsx_vhaddw_d_w(error, error); + error = __lsx_vhaddw_q_d(error, error); + err += __lsx_vpickve2gr_w(error, 0); + } + return err; +} diff --git a/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c new file mode 100644 index 0000000000..75889192a7 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vp8_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp8/encoder/block.h" + +#define BOOST_QUANT1(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +#define BOOST_QUANT2(_in0, _in1, _in2, _ui) \ + { \ + if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \ + if (__lsx_vpickve2gr_h(_in1, _ui)) { \ + eob = _ui + 8; \ + boost_temp = zbin_boost; \ + } else { \ + boost_temp++; \ + } \ + } else { \ + _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui); \ + boost_temp++; \ + } \ + } + +static int8_t exact_regular_quantize_b_lsx( + int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in, + int16_t *q_coeff, int16_t *dq_coeff) { + int32_t eob; + int16_t *boost_temp = zbin_boost; + __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 }; + __m128i sign_z0, sign_z1, q_coeff0, q_coeff1; + __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0, + de_quant1; + __m128i z0, z1, round0, round1, quant0, quant2; + __m128i inv_zig_zag0, inv_zig_zag1; + __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 }; + __m128i zigzag_mask1 = { 0x000A000D000C0009, 0X000F000E000B0007 }; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i zero = __lsx_vldi(0); + + zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in); + inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag); + inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag); + eob = -1; + DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0, + round1); + DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2, + tmp3); + DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0, + zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2, + z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1); + DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1); + DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1); + + DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3); + DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1, + quant0, quant2); + DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1); + DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2); + DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3); + DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2); + DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3); + DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1); + DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1); + DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1); + + BOOST_QUANT1(z_bin0, x0, sign_x0, 0); + BOOST_QUANT1(z_bin0, x0, sign_x0, 1); + BOOST_QUANT1(z_bin0, x0, sign_x0, 2); + BOOST_QUANT1(z_bin0, x0, sign_x0, 3); + BOOST_QUANT1(z_bin0, x0, sign_x0, 4); + BOOST_QUANT1(z_bin0, x0, sign_x0, 5); + BOOST_QUANT1(z_bin0, x0, sign_x0, 6); + BOOST_QUANT1(z_bin0, x0, sign_x0, 7); + + BOOST_QUANT2(z_bin1, x1, sign_x1, 0); + BOOST_QUANT2(z_bin1, x1, sign_x1, 1); + BOOST_QUANT2(z_bin1, x1, sign_x1, 2); + BOOST_QUANT2(z_bin1, x1, sign_x1, 3); + BOOST_QUANT2(z_bin1, x1, sign_x1, 4); + BOOST_QUANT2(z_bin1, x1, sign_x1, 5); + BOOST_QUANT2(z_bin1, x1, sign_x1, 6); + BOOST_QUANT2(z_bin1, x1, sign_x1, 7); + + DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1); + DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1, + sign_x1, sign_x0, q_coeff0, q_coeff1); + DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, + de_quant1); + __lsx_vst(q_coeff0, q_coeff, 0); + __lsx_vst(q_coeff1, q_coeff, 16); + __lsx_vst(de_quant0, dq_coeff, 0); + __lsx_vst(de_quant1, dq_coeff, 16); + + return (int8_t)(eob + 1); +} + +void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) { + int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + int16_t *coeff_ptr = b->coeff; + int16_t *zbin_ptr = b->zbin; + int16_t *round_ptr = b->round; + int16_t *quant_ptr = b->quant; + int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *dequant_ptr = d->dequant; + int16_t zbin_oq_value = b->zbin_extra; + + *d->eob = exact_regular_quantize_b_lsx( + zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr); +} diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.c b/media/libvpx/libvpx/vp8/encoder/mcomp.c index 970120f3b2..6861adaa86 100644 --- a/media/libvpx/libvpx/vp8/encoder/mcomp.c +++ b/media/libvpx/libvpx/vp8/encoder/mcomp.c @@ -21,11 +21,6 @@ #include "vp8/common/common.h" #include "vpx_dsp/vpx_dsp_common.h" -#ifdef VP8_ENTROPY_STATS -static int mv_ref_ct[31][4][2]; -static int mv_mode_cts[4][2]; -#endif - int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { /* MV costing is based on the distribution of vectors in the previous * frame and as such will tend to over state the cost of vectors. In @@ -34,19 +29,22 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { * NEAREST for subsequent blocks. The "Weight" parameter allows, to a * limited extent, for some account to be taken of these factors. */ - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - Weight) >> - 7; + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit) { /* Ignore mv costing if mvcost is NULL */ if (mvcost) { - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - error_per_bit + + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + 128) >> 8; } @@ -190,14 +188,15 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { */ /* estimated cost of a motion vector (r,c) */ -#define MVC(r, c) \ - (mvcost \ - ? ((mvcost[0][(r)-rr] + mvcost[1][(c)-rc]) * error_per_bit + 128) >> 8 \ - : 0) +#define MVC(r, c) \ + (mvcost ? ((mvcost[0][(r) - rr] + mvcost[1][(c) - rc]) * error_per_bit + \ + 128) >> \ + 8 \ + : 0) /* pointer to predictor base of a motionvector */ #define PRE(r, c) (y + (((r) >> 2) * y_stride + ((c) >> 2) - (offset))) /* convert motion vector component to offset for svf calc */ -#define SP(x) (((x)&3) << 1) +#define SP(x) (((x) & 3) << 1) /* returns subpixel variance error function. */ #define DIST(r, c) \ vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse) @@ -206,19 +205,22 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) { /* returns distortion + motion vector cost */ #define ERR(r, c) (MVC(r, c) + DIST(r, c)) /* checks if (r,c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - IFMVCV(r, c, \ - { \ - thismse = DIST(r, c); \ - if ((v = (MVC(r, c) + thismse)) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = UINT_MAX;) +#define CHECK_BETTER(v, r, c) \ + do { \ + IFMVCV( \ + r, c, \ + { \ + thismse = DIST(r, c); \ + if ((v = (MVC(r, c) + thismse)) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + }, \ + v = UINT_MAX;) \ + } while (0) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -253,7 +255,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int pre_stride = x->e_mbd.pre.y_stride; unsigned char *base_pre = x->e_mbd.pre.y_buffer; -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 MACROBLOCKD *xd = &x->e_mbd; unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col; @@ -285,8 +287,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; /* central mv */ - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); /* calculate central point error */ besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1); @@ -346,8 +348,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, tc = bc; } - bestmv->as_mv.row = br * 2; - bestmv->as_mv.col = bc * 2; + bestmv->as_mv.row = clamp(br * 2, SHRT_MIN, SHRT_MAX); + bestmv->as_mv.col = clamp(bc * 2, SHRT_MIN, SHRT_MAX); if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) { @@ -382,7 +384,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int pre_stride = x->e_mbd.pre.y_stride; unsigned char *base_pre = x->e_mbd.pre.y_buffer; -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 MACROBLOCKD *xd = &x->e_mbd; unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col; @@ -399,8 +401,8 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #endif /* central mv */ - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); startmv = *bestmv; /* calculate central point error */ @@ -678,7 +680,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int pre_stride = x->e_mbd.pre.y_stride; unsigned char *base_pre = x->e_mbd.pre.y_buffer; -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 MACROBLOCKD *xd = &x->e_mbd; unsigned char *y_0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col; @@ -695,8 +697,8 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #endif /* central mv */ - bestmv->as_mv.row *= 8; - bestmv->as_mv.col *= 8; + bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); startmv = *bestmv; /* calculate central point error */ @@ -802,13 +804,13 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BOUNDS(range) \ - { \ + do { \ all_in = 1; \ all_in &= ((br - range) >= x->mv_row_min); \ all_in &= ((br + range) <= x->mv_row_max); \ all_in &= ((bc - range) >= x->mv_col_min); \ all_in &= ((bc + range) <= x->mv_col_max); \ - } + } while (0) #define CHECK_POINT \ { \ @@ -819,7 +821,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } #define CHECK_BETTER \ - { \ + do { \ if (thissad < bestsad) { \ thissad += \ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \ @@ -828,7 +830,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, best_site = i; \ } \ } \ - } + } while (0) static const MV next_chkpts[6][3] = { { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } }, @@ -839,7 +841,7 @@ static const MV next_chkpts[6][3] = { int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], - int *mvcost[2], int_mv *center_mv) { + int_mv *center_mv) { MV hex[6] = { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }; @@ -868,8 +870,6 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - (void)mvcost; - /* adjust ref_mv to make sure it is within MV range */ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); @@ -905,7 +905,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, #endif /* hex search */ - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 6; ++i) { @@ -914,7 +914,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 6; ++i) { @@ -924,7 +924,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -938,7 +938,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (j = 1; j < hex_range; ++j) { best_site = -1; - CHECK_BOUNDS(2) + CHECK_BOUNDS(2); if (all_in) { for (i = 0; i < 3; ++i) { @@ -947,7 +947,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 3; ++i) { @@ -957,7 +957,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -979,7 +979,7 @@ int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, cal_neighbors: for (j = 0; j < dia_range; ++j) { best_site = -1; - CHECK_BOUNDS(1) + CHECK_BOUNDS(1); if (all_in) { for (i = 0; i < 4; ++i) { @@ -988,7 +988,7 @@ cal_neighbors: this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } else { for (i = 0; i < 4; ++i) { @@ -998,7 +998,7 @@ cal_neighbors: this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride); - CHECK_BETTER + CHECK_BETTER; } } @@ -1124,13 +1124,14 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1273,17 +1274,18 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; + this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSE2 || HAVE_MSA || HAVE_LSX -int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv) { unsigned char *what = (*(b->base_src) + b->src); int what_stride = b->src_stride; unsigned char *in_what; @@ -1327,8 +1329,8 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - /* Apply further limits to prevent us looking using vectors that - * stretch beyiond the UMV border + /* Apply further limits to prevent us looking using vectors that stretch + * beyond the UMV border */ if (col_min < x->mv_col_min) col_min = x->mv_col_min; @@ -1345,121 +1347,6 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, for (c = col_min; c < col_max; ++c) { thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - - check_here++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} - -int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - unsigned char *in_what; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that stretch - * beyond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - if (thissad < bestsad) { this_mv.as_mv.col = c; thissad += @@ -1474,158 +1361,11 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } check_here++; - c++; } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; - - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + - mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); -} - -int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int sad_per_bit, int distance, - vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], - int_mv *center_mv) { - unsigned char *what = (*(b->base_src) + b->src); - int what_stride = b->src_stride; - int pre_stride = x->e_mbd.pre.y_stride; - unsigned char *base_pre = x->e_mbd.pre.y_buffer; - unsigned char *in_what; - int in_what_stride = pre_stride; - int mv_stride = pre_stride; - unsigned char *bestaddress; - int_mv *best_mv = &d->bmi.mv; - int_mv this_mv; - unsigned int bestsad; - unsigned int thissad; - int r, c; - - unsigned char *check_here; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - - DECLARE_ALIGNED(16, unsigned int, sad_array8[8]); - unsigned int sad_array[3]; - - int *mvsadcost[2]; - int_mv fcenter_mv; - - mvsadcost[0] = x->mvsadcost[0]; - mvsadcost[1] = x->mvsadcost[1]; - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Work out the mid point for the search */ - in_what = base_pre + d->offset; - bestaddress = in_what + (ref_row * pre_stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - /* Baseline value at the centre */ - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) + - mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - /* Apply further limits to prevent us looking using vectors that stretch - * beyond the UMV border - */ - if (col_min < x->mv_col_min) col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) row_max = x->mv_row_max; - - for (r = row_min; r < row_max; ++r) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 7) < col_max) { - int i; - - fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8); - - for (i = 0; i < 8; ++i) { - thissad = sad_array8[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; ++i) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride); - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += - mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; - } - } - - check_here++; - c++; - } - } - - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; + this_mv.as_mv.row = clamp(best_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + this_mv.as_mv.col = clamp(best_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); @@ -1702,13 +1442,14 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = clamp(ref_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + this_mv.as_mv.col = clamp(ref_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, int search_range, vp8_variance_fn_ptr_t *fn_ptr, @@ -1812,102 +1553,10 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } } - this_mv.as_mv.row = ref_mv->as_mv.row * 8; - this_mv.as_mv.col = ref_mv->as_mv.col * 8; + this_mv.as_mv.row = clamp(ref_mv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + this_mv.as_mv.col = clamp(ref_mv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } - -#ifdef VP8_ENTROPY_STATS -void print_mode_context(void) { - FILE *f = fopen("modecont.c", "w"); - int i, j; - - fprintf(f, "#include \"entropy.h\"\n"); - fprintf(f, "const int vp8_mode_contexts[6][4] =\n"); - fprintf(f, "{\n"); - - for (j = 0; j < 6; ++j) { - fprintf(f, " { /* %d */\n", j); - fprintf(f, " "); - - for (i = 0; i < 4; ++i) { - int overal_prob; - int this_prob; - int count; - - /* Overall probs */ - count = mv_mode_cts[i][0] + mv_mode_cts[i][1]; - - if (count) - overal_prob = 256 * mv_mode_cts[i][0] / count; - else - overal_prob = 128; - - if (overal_prob == 0) overal_prob = 1; - - /* context probs */ - count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; - - if (count) - this_prob = 256 * mv_ref_ct[j][i][0] / count; - else - this_prob = 128; - - if (this_prob == 0) this_prob = 1; - - fprintf(f, "%5d, ", this_prob); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} - -/* MV ref count VP8_ENTROPY_STATS stats code */ -#ifdef VP8_ENTROPY_STATS -void init_mv_ref_counts() { - memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); - memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); -} - -void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) { - if (m == ZEROMV) { - ++mv_ref_ct[ct[0]][0][0]; - ++mv_mode_cts[0][0]; - } else { - ++mv_ref_ct[ct[0]][0][1]; - ++mv_mode_cts[0][1]; - - if (m == NEARESTMV) { - ++mv_ref_ct[ct[1]][1][0]; - ++mv_mode_cts[1][0]; - } else { - ++mv_ref_ct[ct[1]][1][1]; - ++mv_mode_cts[1][1]; - - if (m == NEARMV) { - ++mv_ref_ct[ct[2]][2][0]; - ++mv_mode_cts[2][0]; - } else { - ++mv_ref_ct[ct[2]][2][1]; - ++mv_mode_cts[2][1]; - - if (m == NEWMV) { - ++mv_ref_ct[ct[3]][3][0]; - ++mv_mode_cts[3][0]; - } else { - ++mv_ref_ct[ct[3]][3][1]; - ++mv_mode_cts[3][1]; - } - } - } - } -} - -#endif /* END MV ref count VP8_ENTROPY_STATS stats code */ - -#endif +#endif // HAVE_SSE2 || HAVE_MSA diff --git a/media/libvpx/libvpx/vp8/encoder/mcomp.h b/media/libvpx/libvpx/vp8/encoder/mcomp.h index b6228798ff..1ee6fe5dd6 100644 --- a/media/libvpx/libvpx/vp8/encoder/mcomp.h +++ b/media/libvpx/libvpx/vp8/encoder/mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_MCOMP_H_ -#define VP8_ENCODER_MCOMP_H_ +#ifndef VPX_VP8_ENCODER_MCOMP_H_ +#define VPX_VP8_ENCODER_MCOMP_H_ #include "block.h" #include "vpx_dsp/variance.h" @@ -18,11 +18,6 @@ extern "C" { #endif -#ifdef VP8_ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); -#endif - /* The maximum number of steps in a step search given the largest allowed * initial step */ @@ -34,15 +29,14 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); /* Maximum size of the first step in full pel units */ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) -extern void print_mode_context(void); -extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); -extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); -extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int_mv *best_mv, int search_param, int error_per_bit, - const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2], - int *mvcost[2], int_mv *center_mv); +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int_mv *center_mv); typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -51,15 +45,15 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; -extern fractional_mv_step_fp vp8_find_best_half_pixel_step; -extern fractional_mv_step_fp vp8_skip_fractional_mv_step; +fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +fractional_mv_step_fp vp8_find_best_sub_pixel_step; +fractional_mv_step_fp vp8_find_best_half_pixel_step; +fractional_mv_step_fp vp8_skip_fractional_mv_step; -typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, - int_mv *ref_mv, int sad_per_bit, - int distance, vp8_variance_fn_ptr_t *fn_ptr, - int *mvcost[2], int_mv *center_mv); +int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int sad_per_bit, int distance, + vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], + int_mv *center_mv); typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, @@ -78,4 +72,4 @@ typedef int (*vp8_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } // extern "C" #endif -#endif // VP8_ENCODER_MCOMP_H_ +#endif // VPX_VP8_ENCODER_MCOMP_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 0000000000..0fd25fcda5 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. + */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + double ff_ph_op1, ff_ph_op3; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x14e808a914e808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op1] \n\t" + "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t" + "dmtc1 %[tmp0], %[ff_ph_op3] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip), + [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); + /* clang-format on */ +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ff_ph_01] \n\t" + "dli %[tmp0], 0x0000000100000001 \n\t" + "dmtc1 %[tmp0], %[ff_pw_01] \n\t" + "dli %[tmp0], 0x0000000300000003 \n\t" + "dmtc1 %[tmp0], %[ff_pw_03] \n\t" + "dli %[tmp0], 0x0001000000010000 \n\t" + "dmtc1 %[tmp0], %[ff_pw_mask] \n\t" + MMI_LI(%[tmp0], 0x02) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "dmtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask), + [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01), + [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03), + [ff_ph_01]"=&f"(ff_ph_01) + : [op]"r"(output), [pitch]"r"((mips_reg)pitch) + : "memory" + ); + /* clang-format on */ +} diff --git a/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 0000000000..1986444aa3 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + int64_t eob = 0; + double ones; + + __asm__ volatile( + // loop 0 ~ 7 + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pcmpeqh %[ones], %[ones], %[ones] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "dli %[tmp0], 0x0f \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp5], %[ftmp5], %[ones] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ones] \n\t" + "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "dli %[tmp0], 0x10 \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "dli %[tmp0], 0xaa \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "dli %[tmp0], 0xffff \n\t" + "dmtc1 %[tmp0], %[ftmp9] \n\t" + "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/media/libvpx/libvpx/vp8/encoder/modecosts.h b/media/libvpx/libvpx/vp8/encoder/modecosts.h index dfb8989f7f..09ee2b5520 100644 --- a/media/libvpx/libvpx/vp8/encoder/modecosts.h +++ b/media/libvpx/libvpx/vp8/encoder/modecosts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_MODECOSTS_H_ -#define VP8_ENCODER_MODECOSTS_H_ +#ifndef VPX_VP8_ENCODER_MODECOSTS_H_ +#define VPX_VP8_ENCODER_MODECOSTS_H_ #ifdef __cplusplus extern "C" { @@ -17,10 +17,10 @@ extern "C" { struct VP8_COMP; -void vp8_init_mode_costs(struct VP8_COMP *x); +void vp8_init_mode_costs(struct VP8_COMP *c); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_MODECOSTS_H_ +#endif // VPX_VP8_ENCODER_MODECOSTS_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.c b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c index 011b62a08f..b1bfb4b54a 100644 --- a/media/libvpx/libvpx/vp8/encoder/mr_dissim.c +++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.c @@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { void vp8_cal_dissimilarity(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int i; /* Note: The first row & first column in mip are outside the frame, which * were initialized to all 0.(ref_frame, mode, mv...) @@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) { store_info->frame_type = cm->frame_type; if (cm->frame_type != KEY_FRAME) { + int i; store_info->is_frame_dropped = 0; for (i = 1; i < MAX_REF_FRAMES; ++i) store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; diff --git a/media/libvpx/libvpx/vp8/encoder/mr_dissim.h b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h index da36628afa..58f5a97623 100644 --- a/media/libvpx/libvpx/vp8/encoder/mr_dissim.h +++ b/media/libvpx/libvpx/vp8/encoder/mr_dissim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_MR_DISSIM_H_ -#define VP8_ENCODER_MR_DISSIM_H_ +#ifndef VPX_VP8_ENCODER_MR_DISSIM_H_ +#define VPX_VP8_ENCODER_MR_DISSIM_H_ #include "vpx_config.h" #ifdef __cplusplus @@ -24,4 +24,4 @@ extern void vp8_store_drop_frame_info(VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_MR_DISSIM_H_ +#endif // VPX_VP8_ENCODER_MR_DISSIM_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c index 648a616c9e..e9cc2365b4 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c +++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c @@ -12,15 +12,18 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" +#include "bitstream.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" #include "onyx_int.h" #include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" #include "vp8/encoder/quantize.h" #include "vp8/common/alloccommon.h" #include "mcomp.h" #include "firstpass.h" #include "vpx_dsp/psnr.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/extend.h" #include "ratectrl.h" @@ -34,43 +37,37 @@ #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" -#if ARCH_ARM +#include "vpx_util/vpx_write_yuv_frame.h" +#if VPX_ARCH_ARM #include "vpx_ports/arm.h" #endif #if CONFIG_MULTI_RES_ENCODING #include "mr_dissim.h" #endif #include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif +#include #include #include #include #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING extern int vp8_update_coef_context(VP8_COMP *cpi); -extern void vp8_update_coef_probs(VP8_COMP *cpi); #endif -extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); -extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); -extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); - -extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, int filt_lvl, - int low_var_thresh, int flag); -extern void print_parms(VP8_CONFIG *ocf, char *filenam); -extern unsigned int vp8_get_processor_freq(); -extern void print_tree_update_probs(); -extern int vp8cx_create_encoder_threads(VP8_COMP *cpi); -extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); - -int vp8_estimate_entropy_savings(VP8_COMP *cpi); +extern unsigned int vp8_get_processor_freq(void); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); - static void set_default_lf_deltas(VP8_COMP *cpi); extern const int vp8_gf_interval_table[101]; @@ -86,6 +83,9 @@ FILE *yuv_file; #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file; #endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif #if 0 FILE *framepsnr; @@ -98,10 +98,6 @@ extern int skip_true_count; extern int skip_false_count; #endif -#ifdef VP8_ENTROPY_STATS -extern int intra_mode_stats[10][10][10]; -#endif - #ifdef SPEEDSTATS unsigned int frames_at_speed[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -189,7 +185,7 @@ static const unsigned char inter_minq[QINDEX_RANGE] = { extern FILE *vpxlogc; #endif -static void save_layer_context(VP8_COMP *cpi) { +void vp8_save_layer_context(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer]; /* Save layer dependent coding state */ @@ -218,14 +214,17 @@ static void save_layer_context(VP8_COMP *cpi) { lc->inter_frame_target = cpi->inter_frame_target; lc->total_byte_count = cpi->total_byte_count; lc->filter_level = cpi->common.filter_level; - + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; + lc->last_q[0] = cpi->last_q[0]; + lc->last_q[1] = cpi->last_q[1]; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); } -static void restore_layer_context(VP8_COMP *cpi, const int layer) { +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; /* Restore layer dependent coding state */ @@ -254,8 +253,11 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->inter_frame_target = lc->inter_frame_target; cpi->total_byte_count = lc->total_byte_count; cpi->common.filter_level = lc->filter_level; - + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; + cpi->last_q[0] = lc->last_q[0]; + cpi->last_q[1] = lc->last_q[1]; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); @@ -266,16 +268,23 @@ static int rescale(int val, int num, int denom) { int64_t llden = denom; int64_t llval = val; - return (int)(llval * llnum / llden); + int64_t result = (llval * llnum / llden); + if (result <= INT_MAX) + return (int)result; + else + return INT_MAX; } -static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int layer, - double prev_layer_framerate) { +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; - lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; + if (cpi->oxcf.target_bitrate[layer] > INT_MAX / 1000) + lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level; @@ -301,9 +310,9 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, /* Work out the average size of a frame within this layer */ if (layer > 0) { lc->avg_frame_size_for_layer = - (int)((cpi->oxcf.target_bitrate[layer] - - cpi->oxcf.target_bitrate[layer - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((cpi->oxcf.target_bitrate[layer] - + cpi->oxcf.target_bitrate[layer - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } lc->active_worst_quality = cpi->oxcf.worst_allowed_q; @@ -327,8 +336,8 @@ static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, - const int prev_num_layers) { +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int prev_num_layers) { int i; double prev_layer_framerate = 0; const int curr_num_layers = cpi->oxcf.number_of_layers; @@ -336,12 +345,12 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // We need this to set the layer context for the new layers below. if (prev_num_layers == 1) { cpi->current_layer = 0; - save_layer_context(cpi); + vp8_save_layer_context(cpi); } for (i = 0; i < curr_num_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; if (i >= prev_num_layers) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); } // The initial buffer levels are set based on their starting levels. // We could set the buffer levels based on the previous state (normalized @@ -356,7 +365,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, // state (to smooth-out quality dips/rate fluctuation at transition)? // We need to treat the 1 layer case separately: oxcf.target_bitrate[i] - // is not set for 1 layer, and the restore_layer_context/save_context() + // is not set for 1 layer, and the vp8_restore_layer_context/save_context() // are not called in the encoding loop, so we need to call it here to // pass the layer context state to |cpi|. if (curr_num_layers == 1) { @@ -364,7 +373,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, lc->buffer_level = cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000; lc->bits_off_target = lc->buffer_level; - restore_layer_context(cpi, 0); + vp8_restore_layer_context(cpi, 0); } prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -394,16 +403,13 @@ static void setup_features(VP8_COMP *cpi) { static void dealloc_raw_frame_buffers(VP8_COMP *cpi); -void vp8_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vpx_dsp_rtcd(); - vp8_init_intra_predictors(); - init_done = 1; - } +static void initialize_enc(void) { + vpx_dsp_rtcd(); + vp8_init_intra_predictors(); } +void vp8_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->tplist); cpi->tplist = NULL; @@ -444,23 +450,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if CONFIG_MULTITHREAD - /* De-allocate mutex */ - if (cpi->pmutex != NULL) { - VP8_COMMON *const pc = &cpi->common; - int i; - - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - - vpx_free(cpi->mt_current_mb_col); - cpi->mt_current_mb_col = NULL; -#endif } static void enable_segmentation(VP8_COMP *cpi) { @@ -502,7 +491,7 @@ static void set_segmentation_map(VP8_COMP *cpi, */ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } @@ -615,6 +604,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); } +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. + for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -643,8 +685,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi) { /* Convenience macros for mapping speed and mode into a continuous * range */ -#define GOOD(x) (x + 1) -#define RT(x) (x + 7) +#define GOOD(x) ((x) + 1) +#define RT(x) ((x) + 7) static int speed_map(int speed, const int *map) { int res; @@ -697,9 +739,9 @@ static const int mode_check_freq_map_zn2[] = { 0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX }; -static const int mode_check_freq_map_vhbpred[] = { - 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX -}; +static const int mode_check_freq_map_vhbpred[] = { 0, GOOD(5), 2, RT(0), + 0, RT(3), 2, RT(5), + 4, INT_MAX }; static const int mode_check_freq_map_near2[] = { 0, GOOD(5), 2, RT(0), 0, RT(3), 2, @@ -715,18 +757,19 @@ static const int mode_check_freq_map_new2[] = { 0, GOOD(5), 4, RT(0), 1 << 3, RT(11), 1 << 4, RT(12), 1 << 5, INT_MAX }; -static const int mode_check_freq_map_split1[] = { - 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX -}; +static const int mode_check_freq_map_split1[] = { 0, GOOD(2), 2, GOOD(3), + 7, RT(1), 2, RT(2), + 7, INT_MAX }; -static const int mode_check_freq_map_split2[] = { - 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX -}; +static const int mode_check_freq_map_split2[] = { 0, GOOD(1), 2, GOOD(2), + 4, GOOD(3), 15, RT(1), + 4, RT(2), 15, INT_MAX }; void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; int Mode = cpi->compressor_speed; int Speed = cpi->Speed; + int Speed2; int i; VP8_COMMON *cm = &cpi->common; int last_improved_quant = sf->improved_quant; @@ -828,9 +871,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) { cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] = cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed, mode_check_freq_map_vhbpred); - cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1); + + // For real-time mode at speed 10 keep the mode_check_freq threshold + // for NEW1 similar to that of speed 9. + Speed2 = Speed; + if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9); + cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1); + cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] = speed_map(Speed, mode_check_freq_map_new2); + cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed, mode_check_freq_map_split1); cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] = @@ -974,7 +1024,7 @@ void vp8_set_speed_features(VP8_COMP *cpi) { memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins)); - }; /* switch */ + } /* switch */ /* Slow quant, dct and trellis not worthwhile for first pass * so make sure they are always turned off. @@ -1087,9 +1137,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int width = cm->Width; int height = cm->Height; -#if CONFIG_MULTITHREAD - int prev_mb_rows = cm->mb_rows; -#endif if (vp8_alloc_frame_buffers(cm, width, height)) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1125,7 +1172,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #else unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; #endif - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); } /* Data used for real time vc mode to see if gf needs refreshing */ @@ -1134,37 +1182,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { /* Structures used to monitor GF usage */ vpx_free(cpi->gf_active_flags); CHECK_MEM_ERROR( - cpi->gf_active_flags, + &cpi->common.error, cpi->gf_active_flags, vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR( - cpi->mb_activity_map, + &cpi->common.error, cpi->mb_activity_map, vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); /* allocate memory for storing last frame's MVs for MV prediction. */ vpx_free(cpi->lfmv); - CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), - sizeof(*cpi->lfmv))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); vpx_free(cpi->lf_ref_frame_sign_bias); - CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame_sign_bias))); vpx_free(cpi->lf_ref_frame); - CHECK_MEM_ERROR(cpi->lf_ref_frame, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame))); /* Create the encoder segmentation map and set all entries to 0 */ vpx_free(cpi->segmentation_map); CHECK_MEM_ERROR( - cpi->segmentation_map, + &cpi->common.error, cpi->segmentation_map, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); cpi->cyclic_refresh_mode_index = 0; vpx_free(cpi->active_map); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(*cpi->active_map))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); #if CONFIG_MULTITHREAD @@ -1177,36 +1227,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } else { cpi->mt_sync_range = 16; } - - if (cpi->oxcf.multi_threaded > 1) { - int i; - - /* De-allocate and re-allocate mutex */ - if (cpi->pmutex != NULL) { - for (i = 0; i < prev_mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - - CHECK_MEM_ERROR(cpi->pmutex, - vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows)); - if (cpi->pmutex) { - for (i = 0; i < cm->mb_rows; ++i) { - pthread_mutex_init(&cpi->pmutex[i], NULL); - } - } - - vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(cpi->mt_current_mb_col, - vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); - } - #endif vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1237,16 +1262,25 @@ int vp8_reverse_trans(int x) { return 63; } -void vp8_new_framerate(VP8_COMP *cpi, double framerate) { - if (framerate < .1) framerate = 30; +static double clamp_framerate(double framerate) { + if (framerate < .1) + return 30.0; + else + return framerate; +} + +void vp8_new_framerate(VP8_COMP *cpi, double framerate) { + framerate = clamp_framerate(framerate); cpi->framerate = framerate; cpi->output_framerate = framerate; - cpi->per_frame_bandwidth = - (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); + const double per_frame_bandwidth = + round(cpi->oxcf.target_bandwidth / cpi->output_framerate); + cpi->per_frame_bandwidth = (int)VPXMIN(per_frame_bandwidth, INT_MAX); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; - cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * - cpi->oxcf.two_pass_vbrmin_section / 100); + const int64_t vbr_min_bits = (int64_t)cpi->av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100; + cpi->min_frame_bandwidth = (int)VPXMIN(vbr_min_bits, INT_MAX); /* Set Maximum gf/arf interval */ cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2); @@ -1273,7 +1307,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { } } -static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1334,7 +1368,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { double prev_layer_framerate = 0; for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i]; } @@ -1351,7 +1385,7 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { #endif } -static void update_layer_contexts(VP8_COMP *cpi) { +void vp8_update_layer_contexts(VP8_COMP *cpi) { VP8_CONFIG *oxcf = &cpi->oxcf; /* Update snapshots of the layer contexts to reflect new parameters */ @@ -1364,7 +1398,10 @@ static void update_layer_contexts(VP8_COMP *cpi) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; lc->framerate = cpi->ref_framerate / oxcf->rate_decimator[i]; - lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; + if (oxcf->target_bitrate[i] > INT_MAX / 1000) + lc->target_bandwidth = INT_MAX; + else + lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; lc->starting_buffer_level = rescale( (int)oxcf->starting_buffer_level_in_ms, lc->target_bandwidth, 1000); @@ -1386,8 +1423,8 @@ static void update_layer_contexts(VP8_COMP *cpi) { /* Work out the average size of a frame within this layer */ if (i > 0) { lc->avg_frame_size_for_layer = - (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * - 1000 / (lc->framerate - prev_layer_framerate)); + (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) * + 1000 / (lc->framerate - prev_layer_framerate)); } prev_layer_framerate = lc->framerate; @@ -1395,10 +1432,11 @@ static void update_layer_contexts(VP8_COMP *cpi) { } } -void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; + double raw_target_rate; if (!cpi) return; @@ -1413,10 +1451,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; - if (cpi->initial_width) { - oxcf->multi_threaded = cpi->oxcf.multi_threaded; - } - cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { @@ -1502,6 +1536,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { } } + cpi->ext_refresh_frame_flags_pending = 0; + cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; @@ -1521,9 +1557,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1542,6 +1577,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->oxcf.maximum_buffer_size_in_ms = 240000; } + raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 * + cpi->framerate / 1000.0); + if (cpi->oxcf.target_bandwidth > raw_target_rate) + cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate; /* Convert target bandwidth from Kbit/s to Bit/s */ cpi->oxcf.target_bandwidth *= 1000; @@ -1612,7 +1651,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->temporal_layer_id = 0; } cpi->temporal_pattern_counter = 0; - reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); + vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers); } if (!cpi->initial_width) { @@ -1636,9 +1675,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { + int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); Scale2Ratio(cm->vert_scale, &vr, &vs); @@ -1721,7 +1759,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) { } while (++i <= mvfp_max); } -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { int i; VP8_COMP *cpi; @@ -1743,8 +1781,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); vp8_create_common(&cpi->common); @@ -1784,6 +1823,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -1841,26 +1882,25 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { ? (2 * (cpi->common.mb_rows * cpi->common.mb_cols) / cpi->cyclic_refresh_mode_max_mbs_perframe) : 10; - cpi->gf_interval_onepass_cbr = - VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr)); + cpi->gf_interval_onepass_cbr = clamp(cpi->gf_interval_onepass_cbr, 6, 40); cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; } if (cpi->cyclic_refresh_mode_enabled) { - CHECK_MEM_ERROR(cpi->cyclic_refresh_map, + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); } else { cpi->cyclic_refresh_map = (signed char *)NULL; } - CHECK_MEM_ERROR(cpi->consec_zero_last, - vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); - CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, - vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); -#ifdef VP8_ENTROPY_STATS - init_context_counters(); -#endif + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, + vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, + vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; @@ -1876,6 +1916,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.refresh_alt_ref_frame = 0; cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; + cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1929,6 +1972,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -1966,12 +2012,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mb.rd_thresh_mult[i] = 128; } -#ifdef VP8_ENTROPY_STATS - init_mv_ref_counts(); -#endif - #if CONFIG_MULTITHREAD if (vp8cx_create_encoder_threads(cpi)) { + cpi->common.error.setjmp = 0; vp8_remove_compressor(&cpi); return 0; } @@ -1980,39 +2023,29 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16; - cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3; - cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8; cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8; - cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3; - cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8; cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16; - cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3; - cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8; cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8; - cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3; - cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8; cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4; - cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3; - cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8; cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d; -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn; cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn; cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn; @@ -2020,7 +2053,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn; #endif - cpi->full_search_sad = vp8_full_search_sad; cpi->diamond_search_sad = vp8_diamond_search_sad; cpi->refining_search_sad = vp8_refining_search_sad; @@ -2036,8 +2068,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { vp8_loop_filter_init(cm); - cpi->common.error.setjmp = 0; - #if CONFIG_MULTI_RES_ENCODING /* Calculate # of MBs in a row in lower-resolution level image. */ @@ -2064,11 +2094,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { vp8_setup_block_ptrs(&cpi->mb); vp8_setup_block_dptrs(&cpi->mb.e_mbd); + cpi->common.error.setjmp = 0; + return cpi; } -void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = *ptr; +void vp8_remove_compressor(VP8_COMP **comp) { + VP8_COMP *cpi = *comp; if (!cpi) return; @@ -2081,12 +2113,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #endif -#ifdef VP8_ENTROPY_STATS - print_context_counters(); - print_tree_update_probs(); - print_mode_context(); -#endif - #if CONFIG_INTERNAL_STATS if (cpi->pass != 1) { @@ -2094,11 +2120,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = - (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; - double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; - const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000; - const double rate_err = ((100.0 * (dr - target_rate)) / target_rate); if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2127,6 +2148,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { total_psnr2, total_ssim); } } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; double total_psnr = @@ -2175,8 +2197,8 @@ void vp8_remove_compressor(VP8_COMP **ptr) { { extern int count_mb_seg[4]; FILE *f = fopen("modes.stt", "a"); - double dr = (double)cpi->framerate * (double)bytes * (double)8 / - (double)count / (double)1000; + double dr = cpi->framerate * (double)bytes * (double)8 / (double)count / + (double)1000; fprintf(f, "intra_mode in Intra Frames:\n"); fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); @@ -2217,40 +2239,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { } #endif -#ifdef VP8_ENTROPY_STATS - { - int i, j, k; - FILE *fmode = fopen("modecontext.c", "w"); - - fprintf(fmode, "\n#include \"entropymode.h\"\n\n"); - fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts "); - fprintf(fmode, - "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n"); - - for (i = 0; i < 10; ++i) { - fprintf(fmode, " { /* Above Mode : %d */\n", i); - - for (j = 0; j < 10; ++j) { - fprintf(fmode, " {"); - - for (k = 0; k < 10; ++k) { - if (!intra_mode_stats[i][j][k]) - fprintf(fmode, " %5d, ", 1); - else - fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]); - } - - fprintf(fmode, "}, /* left_mode %d */\n", j); - } - - fprintf(fmode, " },\n"); - } - - fprintf(fmode, "};\n"); - fclose(fmode); - } -#endif - #if defined(SECTIONBITS_OUTPUT) if (0) { @@ -2268,7 +2256,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif @@ -2284,13 +2272,14 @@ void vp8_remove_compressor(VP8_COMP **ptr) { dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); + vpx_free(cpi->skin_map); vpx_free(cpi->cyclic_refresh_map); vpx_free(cpi->consec_zero_last); vpx_free(cpi->consec_zero_last_mvbias); vp8_remove_common(&cpi->common); vpx_free(cpi); - *ptr = 0; + *comp = 0; #ifdef OUTPUT_YUV_SRC fclose(yuv_file); @@ -2298,6 +2287,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif #if 0 @@ -2425,6 +2417,7 @@ int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { if (ref_frame_flags & VP8_ALTR_FRAME) cpi->common.refresh_alt_ref_frame = 1; + cpi->ext_refresh_frame_flags_pending = 1; return 0; } @@ -2474,42 +2467,13 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) -void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - unsigned char *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; /* are we resizing the image */ if (cm->horiz_scale != 0 || cm->vert_scale != 0) { #if CONFIG_SPATIAL_RESAMPLING - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int tmp_height; if (cm->vert_scale == 3) { @@ -2542,8 +2506,7 @@ static int resize_key_frame(VP8_COMP *cpi) { */ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { - int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs); - int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs); + int hr, hs, vr, vs; int new_width, new_height; /* If we are below the resample DOWN watermark then scale down a @@ -2552,15 +2515,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -2791,7 +2756,7 @@ static int decide_key_frame(VP8_COMP *cpi) { } /* in addition if the following are true and this is not a golden frame * then code a key frame Note that on golden frames there often seems - * to be a pop in intra useage anyway hence this restriction is + * to be a pop in intra usage anyway hence this restriction is * designed to prevent spurious key frames. The Intra pop needs to be * investigated. */ @@ -2810,13 +2775,8 @@ static int decide_key_frame(VP8_COMP *cpi) { return code_key_frame; } -static void Pass1Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, - unsigned int *frame_flags) { - (void)size; - (void)dest; - (void)frame_flags; +static void Pass1Encode(VP8_COMP *cpi) { vp8_set_quantizer(cpi, 26); - vp8_first_pass(cpi); } #endif @@ -2853,7 +2813,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) fclose(yframe); } #endif -/* return of 0 means drop frame */ #if !CONFIG_REALTIME_ONLY /* Function to test for conditions that indeicate we should loop @@ -2916,8 +2875,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; - } else /* For non key frames */ - { + } else { if (cm->refresh_alt_ref_frame) { assert(!cm->copy_buffer_to_arf); @@ -2938,8 +2896,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_arf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; @@ -2971,8 +2928,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_gf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; @@ -3003,8 +2959,7 @@ static void update_reference_frames(VP8_COMP *cpi) { int i; for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); - } else /* For non key frames */ - { + } else { vp8_yv12_extend_frame_borders( &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); @@ -3059,6 +3014,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, } // Only return non-zero if we have at least ~1/16 samples for estimate. if (num_blocks > (tot_num_blocks >> 4)) { + assert(num_blocks != 0); return (Total / num_blocks); } else { return 0; @@ -3191,11 +3147,15 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { if (cm->no_lpf) { cm->filter_level = 0; } else { +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer timer; +#endif vpx_clear_system_state(); +#if CONFIG_INTERNAL_STATS vpx_usec_timer_start(&timer); +#endif if (cpi->sf.auto_filter == 0) { #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) { @@ -3230,13 +3190,16 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8cx_set_alt_lf_level(cpi, cm->filter_level); } +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&timer); cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); +#endif } #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { - sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { + /* signal that we have set filter_level */ + vp8_sem_post(&cpi->h_event_end_lpf); } #endif @@ -3248,6 +3211,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. + * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). + */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -3258,7 +3328,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int frame_under_shoot_limit; int Loop = 0; - int loop_count; VP8_COMMON *cm = &cpi->common; int active_worst_qchanged = 0; @@ -3274,12 +3343,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int undershoot_seen = 0; #endif - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); - int drop_mark75 = drop_mark * 2 / 3; - int drop_mark50 = drop_mark / 4; - int drop_mark25 = drop_mark / 8; - /* Clear down mmx registers to allow floating point in what follows */ vpx_clear_system_state(); @@ -3303,10 +3366,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } break; #endif // !CONFIG_REALTIME_ONLY - default: - cpi->per_frame_bandwidth = - (int)(cpi->target_bandwidth / cpi->output_framerate); + default: { + const double per_frame_bandwidth = + round(cpi->target_bandwidth / cpi->output_framerate); + cpi->per_frame_bandwidth = (int)VPXMIN(per_frame_bandwidth, INT_MAX); break; + } } /* Default turn off buffer to buffer copying */ @@ -3358,11 +3423,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; if (cpi->oxcf.mr_encoder_id) { - // TODO(marpan): This constraint shouldn't be needed, as we would like - // to allow for key frame setting (forced or periodic) defined per - // spatial layer. For now, keep this in. - cm->frame_type = low_res_frame_info->frame_type; - // Check if lower resolution is available for motion vector reuse. if (cm->frame_type != KEY_FRAME) { cpi->mr_low_res_mv_avail = 1; @@ -3387,7 +3447,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); */ } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; // On a key frame: For the lowest resolution, keep track of the key frame // counter value. For the higher resolutions, reset the current video @@ -3489,101 +3558,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_rd_ref_frame_probs(cpi); - if (cpi->drop_frames_allowed) { - /* The reset to decimation 0 is only done here for one pass. - * Once it is set two pass leaves decimation on till the next kf. - */ - if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) { - cpi->decimation_factor--; - } - - if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { - cpi->decimation_factor = 1; - - } else if (cpi->buffer_level < drop_mark25 && - (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { - cpi->decimation_factor = 3; - } else if (cpi->buffer_level < drop_mark50 && - (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { - cpi->decimation_factor = 2; - } else if (cpi->buffer_level < drop_mark75 && - (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { - cpi->decimation_factor = 1; - } - } - - /* The following decimates the frame rate according to a regular - * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help - * prevent buffer under-run in CBR mode. Alternatively it might be - * desirable in some situations to drop frame rate but throw more bits - * at each frame. - * - * Note that dropping a key frame can be problematic if spatial - * resampling is also active - */ - if (cpi->decimation_factor > 0) { - switch (cpi->decimation_factor) { - case 1: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; - break; - case 2: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - case 3: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - } - - /* Note that we should not throw out a key frame (especially when - * spatial resampling is enabled). - */ - if (cm->frame_type == KEY_FRAME) { - cpi->decimation_count = cpi->decimation_factor; - } else if (cpi->decimation_count > 0) { - cpi->decimation_count--; - - cpi->bits_off_target += cpi->av_per_frame_bandwidth; - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - -#if CONFIG_MULTI_RES_ENCODING - vp8_store_drop_frame_info(cpi); -#endif - - cm->current_video_frame++; - cpi->frames_since_key++; - // We advance the temporal pattern for dropped frames. - cpi->temporal_pattern_counter++; - -#if CONFIG_INTERNAL_STATS - cpi->count++; -#endif - - cpi->buffer_level = cpi->bits_off_target; - - if (cpi->oxcf.number_of_layers > 1) { - unsigned int i; - - /* Propagate bits saved by dropping the frame to higher - * layers - */ - for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); - if (lc->bits_off_target > lc->maximum_buffer_size) { - lc->bits_off_target = lc->maximum_buffer_size; - } - lc->buffer_level = lc->bits_off_target; - } - } - - return; - } else { - cpi->decimation_count = cpi->decimation_factor; - } - } else { - cpi->decimation_count = 0; + if (vp8_check_drop_buffer(cpi)) { + return; } /* Decide how big to make the frame */ @@ -3594,6 +3570,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif cm->current_video_frame++; cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -3659,7 +3636,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (cpi->this_key_frame_forced) { if (cpi->active_best_quality > cpi->avg_frame_qindex * 7 / 8) { cpi->active_best_quality = cpi->avg_frame_qindex * 7 / 8; - } else if (cpi->active_best_qualityavg_frame_qindex>> 2) { + } else if (cpi->active_best_quality < (cpi->avg_frame_qindex >> 2)) { cpi->active_best_quality = cpi->avg_frame_qindex >> 2; } } @@ -3681,7 +3658,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, Q = cpi->avg_frame_qindex; } - /* For constrained quality dont allow Q less than the cq level */ + /* For constrained quality don't allow Q less than the cq level */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (Q < cpi->cq_target_quality)) { Q = cpi->cq_target_quality; @@ -3708,7 +3685,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } else { cpi->active_best_quality = inter_minq[Q]; - /* For the constant/constrained quality mode we dont want + /* For the constant/constrained quality mode we don't want * q to fall below the cq level. */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -3729,7 +3706,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * higher quality on the frames to prevent bits just going to waste. */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - /* Note that the use of >= here elliminates the risk of a devide + /* Note that the use of >= here elliminates the risk of a divide * by 0 error in the else if clause */ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { @@ -3789,9 +3766,11 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif + compute_skin_map(cpi); + /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. - */ + */ if (cpi->cyclic_refresh_mode_enabled) { // Special case for screen_content_mode with golden frame updates. int disable_cr_gf = @@ -3816,8 +3795,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_save_coding_context(cpi); - loop_count = 0; - scale_and_extend_source(cpi->un_scaled_source, cpi); #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC @@ -3825,8 +3802,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, // (temporal denoising) mode. if (cpi->oxcf.noise_sensitivity >= 3) { if (cpi->denoiser.denoise_pars.spatial_blur != 0) { - vp8_de_noise(cm, cpi->Source, cpi->Source, - cpi->denoiser.denoise_pars.spatial_blur, 1, 0, 0); + vp8_de_noise(cm, cpi->Source, cpi->denoiser.denoise_pars.spatial_blur, 1); } } #endif @@ -3847,9 +3823,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } if (cm->frame_type == KEY_FRAME) { - vp8_de_noise(cm, cpi->Source, cpi->Source, l, 1, 0, 1); + vp8_de_noise(cm, cpi->Source, l, 1); } else { - vp8_de_noise(cm, cpi->Source, cpi->Source, l, 1, 0, 1); + vp8_de_noise(cm, cpi->Source, l, 1); src = cpi->Source->y_buffer; @@ -3862,7 +3838,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(yuv_file, cpi->Source); + vpx_write_yuv_frame(yuv_file, cpi->Source); #endif do { @@ -3973,7 +3949,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (cm->refresh_entropy_probs == 0) { /* save a copy for later refresh */ - memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc)); + cm->lfc = cm->fc; } vp8_update_coef_context(cpi); @@ -3990,8 +3966,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #else /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { - if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; + + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { + if (vp8_drop_encodedframe_overshoot(cpi, Q)) { + vpx_clear_system_state(); + return; + } + if (cm->frame_type != KEY_FRAME) + cpi->last_pred_err_mb = + (int)(cpi->mb.prediction_error / cpi->common.MBs); } cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); @@ -4034,7 +4018,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, q_low = cpi->active_best_quality; q_high = cpi->active_worst_quality; - loop_count++; Loop = 1; continue; @@ -4047,7 +4030,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; /* Are we are overshooting and up against the limit of active max Q. */ - if (((cpi->pass != 2) || + if (!cpi->rt_always_update_correction_factor && + ((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) && (Q == cpi->active_worst_quality) && (cpi->active_worst_quality < cpi->worst_quality) && @@ -4066,9 +4050,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #if !CONFIG_REALTIME_ONLY top_index = cpi->active_worst_quality; #endif // !CONFIG_REALTIME_ONLY - /* If we have updated the active max Q do not call - * vp8_update_rate_correction_factors() this loop. - */ + /* If we have updated the active max Q do not call + * vp8_update_rate_correction_factors() this loop. + */ active_worst_qchanged = 1; } else { active_worst_qchanged = 0; @@ -4244,11 +4228,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } /* Clamp cpi->zbin_over_quant */ - cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low) - ? zbin_oq_low - : (cpi->mb.zbin_over_quant > zbin_oq_high) - ? zbin_oq_high - : cpi->mb.zbin_over_quant; + cpi->mb.zbin_over_quant = + (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low + : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high + : cpi->mb.zbin_over_quant; Loop = Q != last_q; } else { @@ -4260,7 +4243,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (Loop == 1) { vp8_restore_coding_context(cpi); - loop_count++; #if CONFIG_INTERNAL_STATS cpi->tot_recode_hits++; #endif @@ -4274,6 +4256,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->common.current_video_frame++; cpi->frames_since_key++; cpi->drop_frame_count++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -4361,12 +4344,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_cal_dissimilarity(cpi); #endif - /* Update the GF useage maps. + /* Update the GF usage maps. * This is done after completing the compression of a frame when all * modes etc. are finalized but before loop filter */ if (cpi->oxcf.number_of_layers == 1) { - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); } if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; @@ -4382,8 +4365,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* For inter frames the current default behavior is that when * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer * This is purely an encoder decision at present. + * Avoid this behavior when refresh flags are set by the user. */ - if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame) { + if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame && + !cpi->ext_refresh_frame_flags_pending) { cm->copy_buffer_to_arf = 2; } else { cm->copy_buffer_to_arf = 0; @@ -4422,11 +4407,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ - sem_post(&cpi->h_event_start_lpf); + vp8_sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + vp8_sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -4436,7 +4430,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_reference_frames(cpi); #ifdef OUTPUT_YUV_DENOISED - vp8_write_yuv_frame(yuv_denoised_file, + vpx_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); #endif @@ -4446,12 +4440,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif -#if CONFIG_MULTITHREAD - /* wait that filter_level is picked so that we can continue with stream - * packing */ - if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf); -#endif - /* build the bitstream */ vp8_pack_bitstream(cpi, dest, dest_end, size); @@ -4518,7 +4506,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * size within range) then use the last frame value - 1. The -1 * is designed to stop Q and hence the data rate, from * progressively falling away during difficult sections, but at - * the same time reduce the number of itterations around the + * the same time reduce the number of iterations around the * recode loop. */ if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; @@ -4539,10 +4527,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; } - // If the frame dropper is not enabled, don't let the buffer level go below - // some threshold, given here by -|maximum_buffer_size|. For now we only do - // this for screen content input. - if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode && + // Don't let the buffer level go below some threshold, given here + // by -|maximum_buffer_size|. For now we only do this for + // screen content input. + if (cpi->oxcf.screen_content_mode && cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) { cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size; } @@ -4550,22 +4538,24 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* Rolling monitors of whether we are over or underspending used to * help regulate min and Max Q in two pass. */ - cpi->rolling_target_bits = - ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; - cpi->rolling_actual_bits = - ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; - cpi->long_rolling_target_bits = - ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; - cpi->long_rolling_actual_bits = - ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / - 32; + cpi->rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->rolling_target_bits * 3 + cpi->this_frame_target, 2); + cpi->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->rolling_actual_bits * 3 + cpi->projected_frame_size, 2); + cpi->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->long_rolling_target_bits * 31 + cpi->this_frame_target, 5); + cpi->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)cpi->long_rolling_actual_bits * 31 + cpi->projected_frame_size, + 5); /* Actual bits spent */ cpi->total_actual_bits += cpi->projected_frame_size; +#if 0 && CONFIG_INTERNAL_STATS /* Debug stats */ cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size); +#endif cpi->buffer_level = cpi->bits_off_target; @@ -4575,8 +4565,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - - cpi->projected_frame_size); + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); lc->bits_off_target += bits_off_for_this_layer; @@ -4687,6 +4677,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif + cpi->ext_refresh_frame_flags_pending = 0; + if (cm->refresh_golden_frame == 1) { cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; } else { @@ -4761,7 +4753,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->mb.e_mbd.update_mb_segmentation_data = 0; cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; - /* Dont increment frame counters if this was an altref buffer update + /* Don't increment frame counters if this was an altref buffer update * not a real frame */ if (cm->show_frame) { @@ -4770,8 +4762,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->temporal_pattern_counter++; } -/* reset to normal state now that we are done. */ - #if 0 { char filename[512]; @@ -4785,7 +4775,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif /* DEBUG */ - /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -4807,10 +4797,14 @@ static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer timer; +#endif int res = 0; +#if CONFIG_INTERNAL_STATS vpx_usec_timer_start(&timer); +#endif /* Reinit the lookahead buffer if the frame size changes */ if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height) { @@ -4823,8 +4817,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) { res = -1; } +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); +#endif return res; } @@ -4845,24 +4841,19 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush) { VP8_COMMON *cm; - struct vpx_usec_timer tsctimer; struct vpx_usec_timer ticktimer; +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer cmptimer; +#endif YV12_BUFFER_CONFIG *force_src_buffer = NULL; if (!cpi) return -1; cm = &cpi->common; - if (setjmp(cpi->common.error.jmp)) { - cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - - cpi->common.error.setjmp = 1; - +#if CONFIG_INTERNAL_STATS vpx_usec_timer_start(&cmptimer); +#endif cpi->source = NULL; @@ -4950,6 +4941,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; last_duration = cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + // Cap this to avoid overflow of (this_duration - last_duration) * 10 + this_duration = VPXMIN(this_duration, INT64_MAX / 10); /* do a step update if the duration changes by 10% */ if (last_duration) { step = (int)(((this_duration - last_duration) * 10 / last_duration)); @@ -4987,13 +4980,17 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, // be received for that high layer, which will yield an incorrect // frame rate (from time-stamp adjustment in above calculation). if (cpi->oxcf.mr_encoder_id) { - cpi->ref_framerate = low_res_frame_info->low_res_framerate; + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; } else { // Keep track of frame rate for lowest resolution. low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; } } #endif + cpi->ref_framerate = clamp_framerate(cpi->ref_framerate); if (cpi->oxcf.number_of_layers > 1) { unsigned int i; @@ -5016,7 +5013,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cpi->oxcf.number_of_layers > 1) { int layer; - update_layer_contexts(cpi); + vp8_update_layer_contexts(cpi); /* Restore layer specific context & set frame rate */ if (cpi->temporal_layer_id >= 0) { @@ -5026,12 +5023,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, cpi->oxcf .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; } - restore_layer_context(cpi, layer); + vp8_restore_layer_context(cpi, layer); vp8_new_framerate(cpi, cpi->layer_context[layer].framerate); } if (cpi->compressor_speed == 2) { - vpx_usec_timer_start(&tsctimer); vpx_usec_timer_start(&ticktimer); } @@ -5096,7 +5092,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, } switch (cpi->pass) { #if !CONFIG_REALTIME_ONLY - case 1: Pass1Encode(cpi, size, dest, frame_flags); break; + case 1: Pass1Encode(cpi); break; case 2: Pass2Encode(cpi, size, dest, dest_end, frame_flags); break; #endif // !CONFIG_REALTIME_ONLY default: @@ -5106,7 +5102,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cpi->compressor_speed == 2) { unsigned int duration, duration2; - vpx_usec_timer_mark(&tsctimer); vpx_usec_timer_mark(&ticktimer); duration = (int)(vpx_usec_timer_elapsed(&ticktimer)); @@ -5133,18 +5128,18 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, } if (cm->refresh_entropy_probs == 0) { - memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc)); + cm->fc = cm->lfc; } /* Save the contexts separately for alt ref, gold and last. */ /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */ - if (cm->refresh_alt_ref_frame) memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc)); + if (cm->refresh_alt_ref_frame) cpi->lfc_a = cm->fc; - if (cm->refresh_golden_frame) memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc)); + if (cm->refresh_golden_frame) cpi->lfc_g = cm->fc; - if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); + if (cm->refresh_last_frame) cpi->lfc_n = cm->fc; - /* if its a dropped frame honor the requests on subsequent frames */ + /* if it's a dropped frame honor the requests on subsequent frames */ if (*size > 0) { cpi->droppable = !frame_is_reference(cpi); @@ -5157,10 +5152,20 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, } /* Save layer specific state */ - if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi); + if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi); +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); +#endif + +#if CONFIG_MULTITHREAD + /* wait for the lpf thread done */ + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { + vp8_sem_wait(&cpi->h_event_end_lpf); + cpi->b_lpf_running = 0; + } +#endif if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) { generate_psnr_packet(cpi); @@ -5215,7 +5220,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, double weight = 0; vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, - cm->filter_level * 10 / 6, 1, 0); + cm->filter_level * 10 / 6); vpx_clear_system_state(); ye = calc_plane_error(orig->y_buffer, orig->y_stride, pp->y_buffer, @@ -5289,16 +5294,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #endif #endif - cpi->common.error.setjmp = 0; - -#if CONFIG_MULTITHREAD - /* wait for the lpf thread done */ - if (cpi->b_multi_threaded && cpi->b_lpf_running) { - sem_wait(&cpi->h_event_end_lpf); - cpi->b_lpf_running = 0; - } -#endif - return 0; } @@ -5339,28 +5334,25 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; } - // Range check the delta Q values and convert the external Q range values - // to internal ones. - if ((abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) || - (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range)) { - return -1; + for (i = 0; i < MAX_MB_SEGMENTS; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (delta_q[i] > range || delta_q[i] < -range || delta_lf[i] > range || + delta_lf[i] < -range) { + return -1; + } } - // Range check the delta lf values - if ((abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) || - (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range)) { - return -1; - } - - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5397,6 +5389,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } @@ -5416,15 +5413,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/libvpx/vp8/encoder/onyx_int.h index fe775064a4..c2a137398e 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_int.h +++ b/media/libvpx/libvpx/vp8/encoder/onyx_int.h @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ONYX_INT_H_ -#define VP8_ENCODER_ONYX_INT_H_ +#ifndef VPX_VP8_ENCODER_ONYX_INT_H_ +#define VPX_VP8_ENCODER_ONYX_INT_H_ +#include #include + #include "vpx_config.h" #include "vp8/common/onyx.h" #include "treewriter.h" #include "tokenize.h" #include "vp8/common/onyxc_int.h" #include "vpx_dsp/variance.h" +#include "vpx_util/vpx_pthread.h" #include "encodemb.h" #include "vp8/encoder/quantize.h" #include "vp8/common/entropy.h" @@ -57,6 +60,9 @@ extern "C" { #define VP8_TEMPORAL_ALT_REF !CONFIG_REALTIME_ONLY +/* vp8 uses 10,000,000 ticks/second as time stamp */ +#define TICKS_PER_SEC 10000000 + typedef struct { int kf_indicated; unsigned int frames_since_key; @@ -210,7 +216,7 @@ enum { typedef struct { /* Layer configuration */ double framerate; - int target_bandwidth; + int target_bandwidth; /* bits per second */ /* Layer specific coding parameters */ int64_t starting_buffer_level; @@ -226,7 +232,7 @@ typedef struct { int64_t bits_off_target; int64_t total_actual_bits; - int total_target_vs_actual; + int64_t total_target_vs_actual; int worst_quality; int active_worst_quality; @@ -249,10 +255,15 @@ typedef struct { int filter_level; + int frames_since_last_drop_overshoot; + + int force_maxqp; + int last_frame_percent_intra; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + int last_q[2]; } LAYER_CONTEXT; typedef struct VP8_COMP { @@ -331,7 +342,7 @@ typedef struct VP8_COMP { CODING_CONTEXT coding_context; - /* Rate targetting variables */ + /* Rate targeting variables */ int64_t last_prediction_error; int64_t last_intra_error; @@ -350,7 +361,7 @@ typedef struct VP8_COMP { /* GF interval chosen when we coded the last GF */ int current_gf_interval; - /* Total bits overspent becasue of GF boost (cumulative) */ + /* Total bits overspent because of GF boost (cumulative) */ int gf_overspend_bits; /* Used in the few frames following a GF to recover the extra bits @@ -402,7 +413,7 @@ typedef struct VP8_COMP { int long_rolling_actual_bits; int64_t total_actual_bits; - int total_target_vs_actual; /* debug stats */ + int64_t total_target_vs_actual; /* debug stats */ int worst_quality; int active_worst_quality; @@ -428,7 +439,7 @@ typedef struct VP8_COMP { int kf_boost; int last_boost; - int target_bandwidth; + int target_bandwidth; /* bits per second */ struct vpx_codec_pkt_list *output_pkt_list; #if 0 @@ -471,9 +482,11 @@ typedef struct VP8_COMP { int zeromv_count; int lf_zeromv_pct; + unsigned char *skin_map; + unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; - int segment_encode_breakout[MAX_MB_SEGMENTS]; + unsigned int segment_encode_breakout[MAX_MB_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; @@ -503,6 +516,8 @@ typedef struct VP8_COMP { int mse_source_denoised; int force_maxqp; + int frames_since_last_drop_overshoot; + int last_pred_err_mb; // GF update for 1 pass cbr. int gf_update_onepass_cbr; @@ -511,11 +526,10 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */ - int *mt_current_mb_col; + vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; int mt_sync_range; - int b_multi_threaded; + vpx_atomic_int b_multi_threaded; int encoding_thread_count; int b_lpf_running; @@ -527,10 +541,10 @@ typedef struct VP8_COMP { LPFTHREAD_DATA lpf_thread_data; /* events */ - sem_t *h_event_start_encoding; - sem_t *h_event_end_encoding; - sem_t h_event_start_lpf; - sem_t h_event_end_lpf; + vp8_sem_t *h_event_start_encoding; + vp8_sem_t *h_event_end_encoding; + vp8_sem_t h_event_start_lpf; + vp8_sem_t h_event_end_lpf; #endif TOKENLIST *tplist; @@ -539,14 +553,15 @@ typedef struct VP8_COMP { unsigned char *partition_d_end[MAX_PARTITIONS]; fractional_mv_step_fp *find_fractional_mv_step; - vp8_full_search_fn_t full_search_sad; vp8_refining_search_fn_t refining_search_sad; vp8_diamond_search_fn_t diamond_search_sad; vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS]; +#if CONFIG_INTERNAL_STATS uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; uint64_t time_encode_mb_row; +#endif int base_skip_false_prob[128]; @@ -610,7 +625,7 @@ typedef struct VP8_COMP { double totalp_v; double totalp; double total_sq_error2; - int bytes; + uint64_t bytes; double summed_quality; double summed_weights; unsigned int tot_recode_hits; @@ -687,12 +702,33 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. + int use_roi_static_threshold; + + int ext_refresh_frame_flags_pending; + + // Always update correction factor used for rate control after each frame for + // realtime encoding. + int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. + int rt_drop_recode_on_overshoot; } VP8_COMP; void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int prev_num_layers); +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, + const int layer, + double prev_layer_framerate); +void vp8_update_layer_contexts(VP8_COMP *cpi); +void vp8_save_layer_context(VP8_COMP *cpi); +void vp8_restore_layer_context(VP8_COMP *cpi, const int layer); void vp8_new_framerate(VP8_COMP *cpi, double framerate); void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm); @@ -703,26 +739,10 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif +int vp8_check_drop_buffer(VP8_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_ONYX_INT_H_ +#endif // VPX_VP8_ENCODER_ONYX_INT_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.c b/media/libvpx/libvpx/vp8/encoder/pickinter.c index 7b68d35f50..ca6c18f48e 100644 --- a/media/libvpx/libvpx/vp8/encoder/pickinter.c +++ b/media/libvpx/libvpx/vp8/encoder/pickinter.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -24,6 +25,7 @@ #include "vp8/common/reconintra4x4.h" #include "vpx_dsp/variance.h" #include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" #include "rdopt.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -35,82 +37,9 @@ extern unsigned int cnt_pm; #endif -#define MODEL_MODE 1 - extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -// Fixed point implementation of a skin color classifier. Skin color -// is model by a Gaussian distribution in the CbCr color space. -// See ../../test/skin_color_detector_test.cc where the reference -// skin color classifier is defined. - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -// Checks if the input yCbCr values corresponds to skin color. -static int is_skin_color(int y, int cb, int cr, int consec_zeromv) { - if (y < 40 || y > 220) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // No skin if block has been zero motion for long consecutive time. - if (consec_zeromv > 60) return 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; ++i) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { - return 0; - } else if (consec_zeromv > 25 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) { - return 0; - } else { - return 1; - } - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - static int macroblock_corner_grad(unsigned char *signal, int stride, int offsetx, int offsety, int sgnx, int sgny) { @@ -206,8 +135,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, (void)mvcost; (void)distortion; (void)sse; - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; + bestmv->as_mv.row = clamp(bestmv->as_mv.row * 8, SHRT_MIN, SHRT_MAX); + bestmv->as_mv.col = clamp(bestmv->as_mv.col * 8, SHRT_MIN, SHRT_MAX); return 0; } @@ -244,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) { static int pick_intra4x4block(MACROBLOCK *x, int ib, B_PREDICTION_MODE *best_mode, - const int *mode_costs, - - int *bestrate, int *bestdistortion) { + const int *mode_costs, int *bestrate, + int *bestdistortion) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; int dst_stride = x->e_mbd.dst.y_stride; @@ -299,8 +227,8 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -313,6 +241,7 @@ static int pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *best_dist) { cost += r; distortion += d; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; /* Break out case where we have already exceeded best so far value @@ -353,7 +282,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { int Vaverage = 0; int diff; int pred_error[4] = { 0, 0, 0, 0 }, best_error = INT_MAX; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); + MB_PREDICTION_MODE best_mode = MB_MODE_COUNT; for (i = 0; i < 8; ++i) { uleft_col[i] = x->dst.u_buffer[i * x->dst.uv_stride - 1]; @@ -442,6 +371,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) { } } + assert(best_mode != MB_MODE_COUNT); mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode; } @@ -450,12 +380,18 @@ static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { /* Split MV modes currently not supported when RD is nopt enabled, * therefore, only need to modify MVcount in NEWMV mode. */ if (xd->mode_info_context->mbmi.mode == NEWMV) { - x->MVcount[0][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.row - - best_ref_mv->as_mv.row) >> - 1)]++; - x->MVcount[1][mv_max + ((xd->mode_info_context->mbmi.mv.as_mv.col - - best_ref_mv->as_mv.col) >> - 1)]++; + const int row_val = + ((xd->mode_info_context->mbmi.mv.as_mv.row - best_ref_mv->as_mv.row) >> + 1); + const int row_idx = mv_max + row_val; + const int col_val = + ((xd->mode_info_context->mbmi.mv.as_mv.col - best_ref_mv->as_mv.col) >> + 1); + const int col_idx = mv_max + col_val; + if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 && col_idx < MVvals) { + x->MVcount[0][row_idx]++; + x->MVcount[1][col_idx]++; + } } } @@ -633,7 +569,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO best_mbmode; - int_mv best_ref_mv_sb[2]; + int_mv best_ref_mv_sb[2] = { { 0 }, { 0 } }; int_mv mode_mv_sb[2][MB_MODE_COUNT]; int_mv best_ref_mv; int_mv *mode_mv; @@ -671,7 +607,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* search range got from mv_pred(). It uses step_param levels. (0-7) */ int sr = 0; - unsigned char *plane[4][3]; + unsigned char *plane[4][3] = { { 0, 0 } }; int ref_frame_map[4]; int sign_bias = 0; int dot_artifact_candidate = 0; @@ -700,13 +636,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, } } #endif + assert(plane[LAST_FRAME][0] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_y, stride, plane[LAST_FRAME][0], mb_row, mb_col, 0); // If not found in Y channel, check UV channel. if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][1] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_u, stride_uv, plane[LAST_FRAME][1], mb_row, mb_col, 1); if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][2] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_v, stride_uv, plane[LAST_FRAME][2], mb_row, mb_col, 2); @@ -757,27 +696,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif // Check if current macroblock is in skin area. - { - const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] + - x->src.y_buffer[7 * x->src.y_stride + 8] + - x->src.y_buffer[8 * x->src.y_stride + 7] + - x->src.y_buffer[8 * x->src.y_stride + 8]) >> - 2; - const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] + - x->src.u_buffer[3 * x->src.uv_stride + 4] + - x->src.u_buffer[4 * x->src.uv_stride + 3] + - x->src.u_buffer[4 * x->src.uv_stride + 4]) >> - 2; - const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] + - x->src.v_buffer[3 * x->src.uv_stride + 4] + - x->src.v_buffer[4 * x->src.uv_stride + 3] + - x->src.v_buffer[4 * x->src.uv_stride + 4]) >> - 2; - x->is_skin = 0; - if (!cpi->oxcf.screen_content_mode) { - int block_index = mb_row * cpi->common.mb_cols + mb_col; - x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]); - } + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; } #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { @@ -827,10 +749,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; /* If the frame has big static background and current MB is in low - * motion area, its mode decision is biased to ZEROMV mode. - * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). - * At such speed settings, ZEROMV is already heavily favored. - */ + * motion area, its mode decision is biased to ZEROMV mode. + * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). + * At such speed settings, ZEROMV is already heavily favored. + */ if (cpi->Speed < 12) { calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); } @@ -1102,7 +1024,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], - x->mvsadcost, x->mvcost, &best_ref_mv); + x->mvsadcost, &best_ref_mv); mode_mv[NEWMV].as_int = d->bmi.mv.as_int; } else { bestsme = cpi->diamond_search_sad( @@ -1154,10 +1076,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); } + // fall through case NEARESTMV: case NEARMV: if (mode_mv[this_mode].as_int == 0) continue; + // fall through case ZEROMV: @@ -1185,7 +1109,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { /* Store for later use by denoiser. */ - // Dont' denoise with GOLDEN OR ALTREF is they are old reference + // Don't denoise with GOLDEN OR ALTREF is they are old reference // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). int skip_old_reference = ((this_ref_frame != LAST_FRAME) && (cpi->common.current_video_frame - @@ -1387,9 +1311,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, update_mvcount(x, &best_ref_mv); } -void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16 = INT_MAX; - int rate, best_rate = 0, distortion, best_sse; + int rate_, best_rate = 0, distortion, best_sse; MB_PREDICTION_MODE mode, best_mode = DC_PRED; int this_rd; unsigned int sse; @@ -1407,23 +1331,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { xd->predictor, 16); distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor, 16, &sse); - rate = x->mbmode_cost[xd->frame_type][mode]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + rate_ = x->mbmode_cost[xd->frame_type][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion); if (error16x16 > this_rd) { error16x16 = this_rd; best_mode = mode; best_sse = sse; - best_rate = rate; + best_rate = rate_; } } xd->mode_info_context->mbmi.mode = best_mode; - error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse); + error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse); if (error4x4 < error16x16) { xd->mode_info_context->mbmi.mode = B_PRED; - best_rate = rate; + best_rate = rate_; } - *rate_ = best_rate; + *rate = best_rate; } diff --git a/media/libvpx/libvpx/vp8/encoder/pickinter.h b/media/libvpx/libvpx/vp8/encoder/pickinter.h index bf1d0c9749..392fb41593 100644 --- a/media/libvpx/libvpx/vp8/encoder/pickinter.h +++ b/media/libvpx/libvpx/vp8/encoder/pickinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_PICKINTER_H_ -#define VP8_ENCODER_PICKINTER_H_ +#ifndef VPX_VP8_ENCODER_PICKINTER_H_ +#define VPX_VP8_ENCODER_PICKINTER_H_ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" @@ -30,4 +30,4 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } // extern "C" #endif -#endif // VP8_ENCODER_PICKINTER_H_ +#endif // VPX_VP8_ENCODER_PICKINTER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.c b/media/libvpx/libvpx/vp8/encoder/picklpf.c index 6f287322ec..498738fd6d 100644 --- a/media/libvpx/libvpx/vp8/encoder/picklpf.c +++ b/media/libvpx/libvpx/vp8/encoder/picklpf.c @@ -12,12 +12,13 @@ #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" +#include "vp8/encoder/picklpf.h" #include "vp8/encoder/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/alloccommon.h" #include "vp8/common/loopfilter.h" -#if ARCH_ARM +#if VPX_ARCH_ARM #include "vpx_ports/arm.h" #endif @@ -49,6 +50,14 @@ static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, src_y = src_ybc->y_buffer + yoffset; dst_y = dst_ybc->y_buffer + yoffset; + // The border will be used in vp8_loop_filter_partial_frame so it needs to be + // extended to avoid a valgrind warning. + const unsigned char *const top_row = src_ybc->y_buffer; + for (int i = yoffset; i < 0; i += ystride, --linestocopy) { + memcpy(dst_y, top_row, ystride); + dst_y += ystride; + src_y += ystride; + } memcpy(dst_y, src_y, ystride * linestocopy); } diff --git a/media/libvpx/libvpx/vp8/encoder/picklpf.h b/media/libvpx/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 0000000000..03597e5427 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_PICKLPF_H_ +#define VPX_VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_PICKLPF_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/quantize.h b/media/libvpx/libvpx/vp8/encoder/quantize.h index 267150f99f..78746c0c20 100644 --- a/media/libvpx/libvpx/vp8/encoder/quantize.h +++ b/media/libvpx/libvpx/vp8/encoder/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_QUANTIZE_H_ -#define VP8_ENCODER_QUANTIZE_H_ +#ifndef VPX_VP8_ENCODER_QUANTIZE_H_ +#define VPX_VP8_ENCODER_QUANTIZE_H_ #ifdef __cplusplus extern "C" { @@ -31,4 +31,4 @@ extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_QUANTIZE_H_ +#endif // VPX_VP8_ENCODER_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/libvpx/vp8/encoder/ratectrl.c index e89247ae4a..5313f4ad1b 100644 --- a/media/libvpx/libvpx/vp8/encoder/ratectrl.c +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.c @@ -259,9 +259,9 @@ void vp8_setup_key_frame(VP8_COMP *cpi) { /* Make sure we initialize separate contexts for altref,gold, and normal. * TODO shouldn't need 3 different copies of structure to do this! */ - memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc)); - memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc)); - memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc)); + cpi->lfc_a = cpi->common.fc; + cpi->lfc_g = cpi->common.fc; + cpi->lfc_n = cpi->common.fc; cpi->common.filter_level = cpi->common.base_qindex * 3 / 8; @@ -314,7 +314,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { * bandwidth per second * fraction of the initial buffer * level */ - target = cpi->oxcf.starting_buffer_level / 2; + target = (uint64_t)cpi->oxcf.starting_buffer_level / 2; if (target > cpi->oxcf.target_bandwidth * 3 / 2) { target = cpi->oxcf.target_bandwidth * 3 / 2; @@ -327,7 +327,13 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ /* Boost depends somewhat on frame rate: only used for 1 layer case. */ if (cpi->oxcf.number_of_layers == 1) { - kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); + kf_boost = + VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16)); + // cpi->output_framerate may be as large as 10M. Keep kf_boost small + // enough to allow for integer math when multiplying by values in + // kf_boost_qadjustment[]. + const int kMaxKfBoost = 2000; + if (kf_boost > kMaxKfBoost) kf_boost = kMaxKfBoost; } else { /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ kf_boost = initial_boost; @@ -345,19 +351,24 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) kf_boost = 16; - target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); } if (cpi->oxcf.rc_max_intra_bitrate_pct) { - unsigned int max_rate = - cpi->per_frame_bandwidth * cpi->oxcf.rc_max_intra_bitrate_pct / 100; + unsigned int max_rate; + // This product may overflow unsigned int + uint64_t product = cpi->per_frame_bandwidth; + product *= cpi->oxcf.rc_max_intra_bitrate_pct; + product /= 100; + max_rate = (unsigned int)VPXMIN(INT_MAX, product); if (target > max_rate) target = max_rate; } cpi->this_frame_target = (int)target; - /* TODO: if we separate rate targeting from Q targetting, move this. + /* TODO: if we separate rate targeting from Q targeting, move this. * Reset the active worst quality to the baseline value for key frames. */ if (cpi->pass != 2) cpi->active_worst_quality = cpi->worst_quality; @@ -383,7 +394,7 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; int Boost = 0; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -393,12 +404,12 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Not two pass */ if (cpi->pass != 2) { @@ -462,7 +473,7 @@ static void calc_gf_params(VP8_COMP *cpi) { /* Adjust boost based upon ambient Q */ Boost = GFQ_ADJUSTMENT; - /* Adjust based upon most recently measure intra useage */ + /* Adjust based upon most recently measure intra usage */ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra @@ -470,7 +481,7 @@ static void calc_gf_params(VP8_COMP *cpi) { 100; /* Adjust gf boost based upon GF usage since last GF */ - Boost = Boost * gf_adjust_table[gf_frame_useage] / 100; + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; #endif } @@ -498,11 +509,9 @@ static void calc_gf_params(VP8_COMP *cpi) { * This is updated once the real frame size/boost is known. */ if (cpi->oxcf.fixed_q == -1) { - if (cpi->pass == 2) /* 2 Pass */ - { + if (cpi->pass == 2) { /* 2 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - } else /* 1 Pass */ - { + } else { /* 1 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; @@ -513,8 +522,8 @@ static void calc_gf_params(VP8_COMP *cpi) { if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; - if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) { - cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage]; + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; } if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { @@ -634,11 +643,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { /* % Adjustment limited to the range 1% to 10% */ Adjustment = (cpi->last_boost - 100) >> 5; - if (Adjustment < 1) { - Adjustment = 1; - } else if (Adjustment > 10) { + if (Adjustment > 10) { Adjustment = 10; } + assert(Adjustment >= 1); /* Convert to bits */ Adjustment = (cpi->this_frame_target * Adjustment) / 100; @@ -715,7 +723,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } /* lower the target bandwidth for this frame. */ - cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200; + cpi->this_frame_target -= + (int)(((int64_t)cpi->this_frame_target * percent_low) / 200); /* Are we using allowing control of active_worst_allowed_q * according to buffer level. @@ -778,6 +787,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } } else { int percent_high = 0; + int64_t target = cpi->this_frame_target; if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) { @@ -785,8 +795,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (int)((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / one_percent_bits); } else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) { - percent_high = - (int)((100 * cpi->bits_off_target) / (cpi->total_byte_count * 8)); + if (cpi->total_byte_count > 0) { + percent_high = (int)((100 * cpi->bits_off_target) / + (cpi->total_byte_count * 8)); + } else { + percent_high = cpi->oxcf.over_shoot_pct; + } } if (percent_high > cpi->oxcf.over_shoot_pct) { @@ -795,7 +809,9 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { percent_high = 0; } - cpi->this_frame_target += (cpi->this_frame_target * percent_high) / 200; + target += (target * percent_high) / 200; + target = VPXMIN(target, INT_MAX); + cpi->this_frame_target = (int)target; /* Are we allowing control of active_worst_allowed_q according * to buffer level. @@ -889,7 +905,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -899,20 +915,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Is a fixed manual GF frequency being used */ if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high */ if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { cpi->common.refresh_golden_frame = 1; /* Two pass GF descision */ @@ -927,10 +943,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { if (0) { FILE *f; - f = fopen("gf_useaget.stt", "a"); + f = fopen("gf_usaget.stt", "a"); fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->gfu_boost, - GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); fclose(f); } @@ -998,7 +1014,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { * bits on this frame even if it is a contructed arf. * The active maximum quantizer insures that an appropriate * number of bits will be spent if needed for contstructed ARFs. - */ + */ cpi->this_frame_target = 0; } @@ -1054,9 +1070,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { * overflow when values are large */ projected_size_based_on_q = - (int)(((.5 + - rate_correction_factor * - vp8_bits_per_mb[cpi->common.frame_type][Q]) * + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); @@ -1077,8 +1092,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { /* Work out a size correction factor. */ if (projected_size_based_on_q > 0) { - correction_factor = - (100 * cpi->projected_frame_size) / projected_size_based_on_q; + correction_factor = (int)((100 * (int64_t)cpi->projected_frame_size) / + projected_size_based_on_q); } /* More heavily damped adjustment used if we have been oscillating @@ -1128,6 +1143,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { } } +static int limit_q_cbr_inter(int last_q, int current_q) { + int limit_down = 12; + if (last_q - current_q > limit_down) + return (last_q - limit_down); + else + return current_q; +} + int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { int Q = cpi->active_worst_quality; @@ -1175,10 +1198,13 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { /* Calculate required scaling factor based on target frame size and * size of frame produced using previous Q */ - if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) { - /* Case where we would overflow int */ - target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_bits_per_frame > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_bits_per_frame / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; @@ -1267,6 +1293,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { } } + // Limit decrease in Q for 1 pass CBR screen content mode. + if (cpi->common.frame_type != KEY_FRAME && cpi->pass == 0 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.screen_content_mode) + Q = limit_q_cbr_inter(cpi->last_q[1], Q); + return Q; } @@ -1364,14 +1396,17 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, *frame_under_shoot_limit = 0; *frame_over_shoot_limit = INT_MAX; } else { + const int64_t this_frame_target = cpi->this_frame_target; + int64_t over_shoot_limit, under_shoot_limit; + if (cpi->common.frame_type == KEY_FRAME) { - *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; + over_shoot_limit = this_frame_target * 9 / 8; + under_shoot_limit = this_frame_target * 7 / 8; } else { if (cpi->oxcf.number_of_layers > 1 || cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) { - *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; + over_shoot_limit = this_frame_target * 9 / 8; + under_shoot_limit = this_frame_target * 7 / 8; } else { /* For CBR take buffer fullness into account */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { @@ -1381,18 +1416,18 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, /* Buffer is too full so relax overshoot and tighten * undershoot */ - *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8; + over_shoot_limit = this_frame_target * 12 / 8; + under_shoot_limit = this_frame_target * 6 / 8; } else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1)) { /* Buffer is too low so relax undershoot and tighten * overshoot */ - *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8; + over_shoot_limit = this_frame_target * 10 / 8; + under_shoot_limit = this_frame_target * 4 / 8; } else { - *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8; + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 5 / 8; } } /* VBR and CQ mode */ @@ -1402,11 +1437,11 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, else { /* Stron overshoot limit for constrained quality */ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8; + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 2 / 8; } else { - *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8; + over_shoot_limit = this_frame_target * 11 / 8; + under_shoot_limit = this_frame_target * 5 / 8; } } } @@ -1416,9 +1451,13 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, * (eg * 7/8) may be tiny make sure there is at least a minimum * range. */ - *frame_over_shoot_limit += 200; - *frame_under_shoot_limit -= 200; - if (*frame_under_shoot_limit < 0) *frame_under_shoot_limit = 0; + over_shoot_limit += 200; + under_shoot_limit -= 200; + if (under_shoot_limit < 0) under_shoot_limit = 0; + if (under_shoot_limit > INT_MAX) under_shoot_limit = INT_MAX; + if (over_shoot_limit > INT_MAX) over_shoot_limit = INT_MAX; + *frame_under_shoot_limit = (int)under_shoot_limit; + *frame_over_shoot_limit = (int)over_shoot_limit; } } @@ -1442,12 +1481,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) { // If this just encoded frame (mcomp/transform/quant, but before loopfilter and // pack_bitstream) has large overshoot, and was not being encoded close to the // max QP, then drop this frame and force next frame to be encoded at max QP. -// Condition this on 1 pass CBR with screen content mode and frame dropper off. +// Allow this for screen_content_mode = 2, or if drop frames is allowed. // TODO(marpan): Should do this exit condition during the encode_frame // (i.e., halfway during the encoding of the frame) to save cycles. int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && - cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (8.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require // that projected_frame_size is somewhat greater than per-frame-bandwidth, @@ -1458,17 +1518,21 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Rate threshold, in bytes. int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); // Threshold for the average (over all macroblocks) of the pixel-sum - // residual error over 16x16 block. Should add QP dependence on threshold? - int thresh_pred_err_mb = (256 << 4); + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); - if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) { + // Reduce/ignore thresh_rate if pred_err_mb much larger than its threshold, + // give more weight to pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb && + pred_err_mb > 2 * cpi->last_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; double new_correction_factor; - const int target_size = cpi->av_per_frame_bandwidth; int target_bits_per_mb; - // Drop this frame: advance frame counters, and set force_maxqp flag. - cpi->common.current_video_frame++; - cpi->frames_since_key++; + const int target_size = cpi->av_per_frame_bandwidth; // Flag to indicate we will force next frame to be encoded at max QP. cpi->force_maxqp = 1; // Reset the buffer levels. @@ -1481,9 +1545,13 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // undershoots significantly, and then we end up dropping every other // frame because the QP/rate_correction_factor may have been too low // before the drop and then takes too long to come up. - if (target_size >= (INT_MAX >> BPER_MB_NORMBITS)) { - target_bits_per_mb = (target_size / cpi->common.MBs) - << BPER_MB_NORMBITS; + if (target_size > (INT_MAX >> BPER_MB_NORMBITS)) { + int temp = target_size / cpi->common.MBs; + if (temp > (INT_MAX >> BPER_MB_NORMBITS)) { + target_bits_per_mb = INT_MAX; + } else { + target_bits_per_mb = temp << BPER_MB_NORMBITS; + } } else { target_bits_per_mb = (target_size << BPER_MB_NORMBITS) / cpi->common.MBs; @@ -1499,14 +1567,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { cpi->rate_correction_factor = MAX_BPB_FACTOR; } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected. + for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif return 1; - } else { - cpi->force_maxqp = 0; - return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } diff --git a/media/libvpx/libvpx/vp8/encoder/ratectrl.h b/media/libvpx/libvpx/vp8/encoder/ratectrl.h index 249de4e706..844c72cb86 100644 --- a/media/libvpx/libvpx/vp8/encoder/ratectrl.h +++ b/media/libvpx/libvpx/vp8/encoder/ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_RATECTRL_H_ -#define VP8_ENCODER_RATECTRL_H_ +#ifndef VPX_VP8_ENCODER_RATECTRL_H_ +#define VPX_VP8_ENCODER_RATECTRL_H_ #include "onyx_int.h" @@ -37,4 +37,4 @@ extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q); } // extern "C" #endif -#endif // VP8_ENCODER_RATECTRL_H_ +#endif // VPX_VP8_ENCODER_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.c b/media/libvpx/libvpx/vp8/encoder/rdopt.c index 7bbeb28574..ad3866c770 100644 --- a/media/libvpx/libvpx/vp8/encoder/rdopt.c +++ b/media/libvpx/libvpx/vp8/encoder/rdopt.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include @@ -15,12 +16,14 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" #include "pickinter.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" @@ -106,11 +109,10 @@ const int vp8_ref_frame_order[MAX_MODES] = { 0, }; -static void fill_token_costs(int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS], - const vp8_prob p[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES]) { +static void fill_token_costs( + int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS], + const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]) { int i, j, k; for (i = 0; i < BLOCK_TYPES; ++i) { @@ -449,8 +451,8 @@ static int vp8_rdcost_mby(MACROBLOCK *mb) { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *mb->e_mbd.above_context; + t_left = *mb->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; @@ -595,8 +597,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, ENTROPY_CONTEXT *tl; const int *bmode_costs; - memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *mb->e_mbd.above_context; + t_left = *mb->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; @@ -608,9 +610,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, for (i = 0; i < 16; ++i) { MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; - B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), - UNINITIALIZED_IS_SAFE(d); + B_PREDICTION_MODE best_mode = B_MODE_COUNT; + int r = 0, ry = 0, d = 0; if (mb->e_mbd.frame_type == KEY_FRAME) { const B_PREDICTION_MODE A = above_block_mode(mic, i, mis); @@ -627,6 +628,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, distortion += d; tot_rate_y += ry; + assert(best_mode != B_MODE_COUNT); mic->bmi[i].as_mode = best_mode; if (total_rd >= (int64_t)best_rd) break; @@ -644,7 +646,7 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate, int *rate_y, static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int rate, ratey; int distortion; int best_rd = INT_MAX; @@ -674,6 +676,7 @@ static int rd_pick_intra16x16mby_mode(MACROBLOCK *x, int *Rate, int *rate_y, } } + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.mode = mode_selected; return best_rd; } @@ -686,8 +689,8 @@ static int rd_cost_mbuv(MACROBLOCK *mb) { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; - memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *mb->e_mbd.above_context; + t_left = *mb->e_mbd.left_context; ta = (ENTROPY_CONTEXT *)&t_above; tl = (ENTROPY_CONTEXT *)&t_left; @@ -741,9 +744,9 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = MB_MODE_COUNT; int best_rd = INT_MAX; - int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r); + int d = 0, r = 0; int rate_to; MACROBLOCKD *xd = &x->e_mbd; @@ -767,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, vp8_quantize_mbuv(x); rate_to = rd_cost_mbuv(x); - this_rate = rate_to + - x->intra_uv_mode_cost[xd->frame_type] - [xd->mode_info_context->mbmi.uv_mode]; + this_rate = + rate_to + x->intra_uv_mode_cost[xd->frame_type] + [xd->mode_info_context->mbmi.uv_mode]; this_distortion = vp8_mbuverror(x) / 4; @@ -787,6 +790,7 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, *rate = r; *distortion = d; + assert(mode_selected != MB_MODE_COUNT); xd->mode_info_context->mbmi.uv_mode = mode_selected; } @@ -850,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, default: break; } - if (m == ABOVE4X4) /* replace above with left if same */ - { + if (m == ABOVE4X4) { /* replace above with left if same */ int_mv left_mv; left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i); @@ -957,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; - ENTROPY_CONTEXT *ta_b; - ENTROPY_CONTEXT *tl_b; - memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = *x->e_mbd.above_context; + t_left = *x->e_mbd.left_context; - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - ta_b = (ENTROPY_CONTEXT *)&t_above_b; - tl_b = (ENTROPY_CONTEXT *)&t_left_b; + vp8_zero(t_above_b); + vp8_zero(t_left_b); br = 0; bd = 0; @@ -992,7 +989,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, br += rate; for (i = 0; i < label_count; ++i) { - int_mv mode_mv[B_MODE_COUNT]; + int_mv mode_mv[B_MODE_COUNT] = { { 0 }, { 0 } }; int best_label_rd = INT_MAX; B_PREDICTION_MODE mode_selected = ZERO4X4; int bestlabelyrate = 0; @@ -1006,8 +1003,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, ENTROPY_CONTEXT *ta_s; ENTROPY_CONTEXT *tl_s; - memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above_s = t_above; + t_left_s = t_left; ta_s = (ENTROPY_CONTEXT *)&t_above_s; tl_s = (ENTROPY_CONTEXT *)&t_left_s; @@ -1024,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, BLOCK *c; BLOCKD *e; - /* Is the best so far sufficiently good that we cant justify + /* Is the best so far sufficiently good that we can't justify * doing a new motion search. */ if (best_label_rd < label_mv_thresh) break; @@ -1100,8 +1097,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16, - v_fn_ptr, x->mvcost, bsi->ref_mv); + thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16, + v_fn_ptr, x->mvcost, bsi->ref_mv); if (thissme < bestsme) { bestsme = thissme; @@ -1149,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, mode_selected = this_mode; best_label_rd = this_rd; - memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above_b = t_above_s; + t_left_b = t_left_s; } } /*for each 4x4 mode*/ - memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES)); + t_above = t_above_b; + t_left = t_left_b; labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], bsi->ref_mv, x->mvcost); @@ -1567,21 +1564,34 @@ static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { for (i = 0; i < x->partition_info->count; ++i) { if (x->partition_info->bmi[i].mode == NEW4X4) { - x->MVcount[0][mv_max + ((x->partition_info->bmi[i].mv.as_mv.row - - best_ref_mv->as_mv.row) >> - 1)]++; - x->MVcount[1][mv_max + ((x->partition_info->bmi[i].mv.as_mv.col - - best_ref_mv->as_mv.col) >> - 1)]++; + const int row_val = ((x->partition_info->bmi[i].mv.as_mv.row - + best_ref_mv->as_mv.row) >> + 1); + const int row_idx = mv_max + row_val; + const int col_val = ((x->partition_info->bmi[i].mv.as_mv.col - + best_ref_mv->as_mv.col) >> + 1); + const int col_idx = mv_max + col_val; + if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 && + col_idx < MVvals) { + x->MVcount[0][row_idx]++; + x->MVcount[1][col_idx]++; + } } } } else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV) { - x->MVcount[0][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row - - best_ref_mv->as_mv.row) >> - 1)]++; - x->MVcount[1][mv_max + ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col - - best_ref_mv->as_mv.col) >> - 1)]++; + const int row_val = ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row - + best_ref_mv->as_mv.row) >> + 1); + const int row_idx = mv_max + row_val; + const int col_val = ((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col - + best_ref_mv->as_mv.col) >> + 1); + const int col_idx = mv_max + col_val; + if (row_idx >= 0 && row_idx < MVvals && col_idx >= 0 && col_idx < MVvals) { + x->MVcount[0][row_idx]++; + x->MVcount[1][col_idx]++; + } } } @@ -1611,7 +1621,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd, unsigned int q2dc = xd->block[24].dequant[0]; /* If theres is no codeable 2nd order dc or a very small uniform pixel change change */ - if ((sse - var> 4) || (sse / 2 > var && sse - var < 64)) { + if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { /* Check u and v to make sure skip is ok */ unsigned int sse2 = VP8_UVSSE(x); if (sse2 * 2 < threshold) { @@ -1726,9 +1736,8 @@ static void update_best_mode(BEST_MODE *best_mode, int this_rd, (rd->distortion2 - rd->distortion_uv)); best_mode->rd = this_rd; - memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, - sizeof(MB_MODE_INFO)); - memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO)); + best_mode->mbmode = x->e_mbd.mode_info_context->mbmi; + best_mode->partition = *x->partition_info; if ((this_mode == B_PRED) || (this_mode == SPLITMV)) { int i; @@ -1770,7 +1779,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* search range got from mv_pred(). It uses step_param levels. (0-7) */ int sr = 0; - unsigned char *plane[4][3]; + unsigned char *plane[4][3] = { { 0, 0 } }; int ref_frame_map[4]; int sign_bias = 0; @@ -1782,6 +1791,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, best_rd_sse = UINT_MAX; #endif + // _uv variables are not set consistantly before calling update_best_mode. + rd.rate_uv = 0; + rd.distortion_uv = 0; + mode_mv = mode_mv_sb[sign_bias]; best_ref_mv.as_int = 0; best_mode.rd = INT_MAX; @@ -1849,6 +1862,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* everything but intra */ if (x->e_mbd.mode_info_context->mbmi.ref_frame) { + assert(plane[this_ref_frame][0] != NULL && + plane[this_ref_frame][1] != NULL && + plane[this_ref_frame][2] != NULL); x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; @@ -1943,6 +1959,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; if (tmp_rd < best_mode.yrd) { + assert(uv_intra_done); rd.rate2 += uv_intra_rate; rd.rate_uv = uv_intra_rate_tokenonly; rd.distortion2 += uv_intra_distortion; @@ -1974,7 +1991,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; /* If even the 'Y' rd value of split is higher than best so far - * then dont bother looking at UV + * then don't bother looking at UV */ if (tmp_rd < best_mode.yrd) { /* Now work out UV cost and add it in */ @@ -2003,6 +2020,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type] [x->e_mbd.mode_info_context->mbmi.mode]; + assert(uv_intra_done); rd.rate2 += uv_intra_rate; rd.rate_uv = uv_intra_rate_tokenonly; rd.distortion2 += uv_intra_distortion; @@ -2134,6 +2152,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96); } + // fall through case NEARESTMV: case NEARMV: @@ -2150,6 +2169,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, (mode_mv[this_mode].as_int == 0)) { continue; } + // fall through case ZEROMV: @@ -2327,8 +2347,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, } /* macroblock modes */ - memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, - sizeof(MB_MODE_INFO)); + x->e_mbd.mode_info_context->mbmi = best_mode.mbmode; if (best_mode.mbmode.mode == B_PRED) { for (i = 0; i < 16; ++i) { @@ -2341,7 +2360,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int; } - memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO)); + *x->partition_info = best_mode.partition; x->e_mbd.mode_info_context->mbmi.mv.as_int = x->partition_info->bmi[15].mv.as_int; @@ -2355,11 +2374,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd_update_mvcount(x, &best_ref_mv); } -void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16; int rate4x4, rate16x16 = 0, rateuv; int dist4x4, dist16x16, distuv; - int rate; + int rate_; int rate4x4_tokenonly = 0; int rate16x16_tokenonly = 0; int rateuv_tokenonly = 0; @@ -2367,7 +2386,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv); - rate = rateuv; + rate_ = rateuv; error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly, &dist16x16); @@ -2377,10 +2396,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { if (error4x4 < error16x16) { x->e_mbd.mode_info_context->mbmi.mode = B_PRED; - rate += rate4x4; + rate_ += rate4x4; } else { - rate += rate16x16; + rate_ += rate16x16; } - *rate_ = rate; + *rate = rate_; } diff --git a/media/libvpx/libvpx/vp8/encoder/rdopt.h b/media/libvpx/libvpx/vp8/encoder/rdopt.h index 8186ff1051..cc3db8197c 100644 --- a/media/libvpx/libvpx/vp8/encoder/rdopt.h +++ b/media/libvpx/libvpx/vp8/encoder/rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_RDOPT_H_ -#define VP8_ENCODER_RDOPT_H_ +#ifndef VPX_VP8_ENCODER_RDOPT_H_ +#define VPX_VP8_ENCODER_RDOPT_H_ #include "./vpx_config.h" @@ -19,6 +19,9 @@ extern "C" { #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + static INLINE void insertsortmv(int arr[], int len) { int i, j, k; @@ -60,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) { } } -extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, - int *returnrate, int *returndistortion, - int *returnintra, int mb_row, int mb_col); -extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col); +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb, unsigned char *plane[3], @@ -107,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi, for (; i < 4; ++i) ref_frame_map[i] = -1; } -extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, - int_mv *mvp, int refframe, int *ref_frame_sign_bias, - int *sr, int near_sadidx[]); +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); int VP8_UVSSE(MACROBLOCK *x); @@ -120,4 +123,4 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); } // extern "C" #endif -#endif // VP8_ENCODER_RDOPT_H_ +#endif // VPX_VP8_ENCODER_RDOPT_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.c b/media/libvpx/libvpx/vp8/encoder/segmentation.c index dcb68119e1..2127258111 100644 --- a/media/libvpx/libvpx/vp8/encoder/segmentation.c +++ b/media/libvpx/libvpx/vp8/encoder/segmentation.c @@ -11,7 +11,7 @@ #include "segmentation.h" #include "vpx_mem/vpx_mem.h" -void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; MODE_INFO *this_mb_mode_info = cm->mi; @@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { - /* Reset Gf useage monitors */ + /* Reset Gf usage monitors */ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; } else { diff --git a/media/libvpx/libvpx/vp8/encoder/segmentation.h b/media/libvpx/libvpx/vp8/encoder/segmentation.h index 1395a34118..0fecfc2212 100644 --- a/media/libvpx/libvpx/vp8/encoder/segmentation.h +++ b/media/libvpx/libvpx/vp8/encoder/segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_SEGMENTATION_H_ -#define VP8_ENCODER_SEGMENTATION_H_ +#ifndef VPX_VP8_ENCODER_SEGMENTATION_H_ +#define VPX_VP8_ENCODER_SEGMENTATION_H_ #include "string.h" #include "vp8/common/blockd.h" @@ -19,11 +19,11 @@ extern "C" { #endif -extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, - MACROBLOCK *x); +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_SEGMENTATION_H_ +#endif // VPX_VP8_ENCODER_SEGMENTATION_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.c b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c index 1b2f46bb69..1c1a55fde6 100644 --- a/media/libvpx/libvpx/vp8/encoder/temporal_filter.c +++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.c @@ -20,6 +20,7 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" +#include "temporal_filter.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" @@ -157,7 +158,8 @@ static int vp8_temporal_filter_find_matching_mb_c(VP8_COMP *cpi, /* Ignore mv costing by sending NULL cost arrays */ bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv, step_param, sadpb, - &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, &best_ref_mv1); + &cpi->fn_ptr[BLOCK_16X16], NULL, &best_ref_mv1); + (void)bestsme; // Ignore unused return value. #if ALT_REF_SUBPEL_ENABLED /* Try sub-pixel MC? */ diff --git a/media/libvpx/libvpx/vp8/encoder/temporal_filter.h b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 0000000000..fd39f5cb87 --- /dev/null +++ b/media/libvpx/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.c b/media/libvpx/libvpx/vp8/encoder/tokenize.c index ca5f0e3d89..c3d7026607 100644 --- a/media/libvpx/libvpx/vp8/encoder/tokenize.c +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.c @@ -19,10 +19,6 @@ /* Global event counters used for accumulating statistics across several compressions, then generating context.c = initial stats. */ -#ifdef VP8_ENTROPY_STATS -_int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); void vp8_fix_contexts(MACROBLOCKD *x); @@ -383,72 +379,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) { tokenize1st_order_b(x, t, plane_type, cpi); } -#ifdef VP8_ENTROPY_STATS - -void init_context_counters(void) { - memset(context_counters, 0, sizeof(context_counters)); -} - -void print_context_counters() { - int type, band, pt, t; - - FILE *const f = fopen("context.c", "w"); - - fprintf(f, "#include \"entropy.h\"\n"); - - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - fprintf(f, - "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] " - "[MAX_ENTROPY_TOKENS];\n\n"); - - fprintf(f, - "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {"); - -#define Comma(X) (X ? "," : "") - - type = 0; - - do { - fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - - band = 0; - - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - - pt = 0; - - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - - do { - const _int64 x = context_counters[type][band][pt][t]; - const int y = (int)x; - - assert(x == (_int64)y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - - } while (++t < MAX_ENTROPY_TOKENS); - - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - - fprintf(f, "\n }"); - - } while (++band < COEF_BANDS); - - fprintf(f, "\n }"); - } while (++type < BLOCK_TYPES); - - fprintf(f, "\n};\n"); - fclose(f); -} -#endif - static void stuff2nd_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi, MACROBLOCK *x) { int pt; /* near block/prev token context index */ diff --git a/media/libvpx/libvpx/vp8/encoder/tokenize.h b/media/libvpx/libvpx/vp8/encoder/tokenize.h index e5dbdfc5af..5223aa2d86 100644 --- a/media/libvpx/libvpx/vp8/encoder/tokenize.h +++ b/media/libvpx/libvpx/vp8/encoder/tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TOKENIZE_H_ -#define VP8_ENCODER_TOKENIZE_H_ +#ifndef VPX_VP8_ENCODER_TOKENIZE_H_ +#define VPX_VP8_ENCODER_TOKENIZE_H_ #include "vp8/common/entropy.h" #include "block.h" @@ -18,8 +18,6 @@ extern "C" { #endif -void vp8_tokenize_initialize(); - typedef struct { short Token; short Extra; @@ -34,14 +32,6 @@ typedef struct { int rd_cost_mby(MACROBLOCKD *); -#ifdef VP8_ENTROPY_STATS -void init_context_counters(); -void print_context_counters(); - -extern _int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif - extern const short *const vp8_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the @@ -53,4 +43,4 @@ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; } // extern "C" #endif -#endif // VP8_ENCODER_TOKENIZE_H_ +#endif // VPX_VP8_ENCODER_TOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/treewriter.h b/media/libvpx/libvpx/vp8/encoder/treewriter.h index dadbbe3f80..4e9ed6af17 100644 --- a/media/libvpx/libvpx/vp8/encoder/treewriter.h +++ b/media/libvpx/libvpx/vp8/encoder/treewriter.h @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TREEWRITER_H_ -#define VP8_ENCODER_TREEWRITER_H_ +#ifndef VPX_VP8_ENCODER_TREEWRITER_H_ +#define VPX_VP8_ENCODER_TREEWRITER_H_ /* Trees map alphabets into huffman-like codes suitable for an arithmetic bit coder. Timothy S Murphy 11 October 2004 */ +#include + #include "./vpx_config.h" #include "vp8/common/treecoder.h" @@ -48,7 +50,9 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p) { /* Imitate existing calculation */ - return ((ct[0] * vp8_cost_zero(p)) + (ct[1] * vp8_cost_one(p))) >> 8; + return (unsigned int)(((((uint64_t)ct[0]) * vp8_cost_zero(p)) + + (((uint64_t)ct[1]) * vp8_cost_one(p))) >> + 8); } /* Small functions to write explicit values and tokens, as well as @@ -56,8 +60,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], static void vp8_treed_write(vp8_writer *const w, vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ vp8_tree_index i = 0; do { @@ -73,8 +76,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, } static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ int c = 0; vp8_tree_index i = 0; @@ -93,12 +95,12 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p, /* Fill array of costs for all possible token values. */ -void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree); +void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree); -void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int); +void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_TREEWRITER_H_ +#endif // VPX_VP8_ENCODER_TREEWRITER_H_ diff --git a/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c index ff6e04eaad..8b9b22babe 100644 --- a/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c +++ b/media/libvpx/libvpx/vp8/encoder/vp8_quantize.c @@ -9,6 +9,9 @@ */ #include + +#include "./vpx_config.h" +#include "vpx_ports/bitops.h" #include "vpx_mem/vpx_mem.h" #include "onyx_int.h" @@ -162,10 +165,10 @@ static const int qzbin_factors_y2[129] = { static void invert_quant(int improved_quant, short *quant, short *shift, short d) { if (improved_quant) { - unsigned t; + unsigned int t; int l, m; - t = d; - for (l = 0; t > 1; ++l) t >>= 1; + t = (unsigned int)d; + l = get_msb(t); m = 1 + (1 << (16 + l)) / d; *quant = (short)(m - (1 << 16)); *shift = l; @@ -174,8 +177,6 @@ static void invert_quant(int improved_quant, short *quant, short *shift, } else { *quant = (1 << 16) / d; *shift = 0; - /* use multiplication and constant shift by 16 */ - *shift = 1 << (16 - *shift); } } @@ -296,7 +297,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { /* Select the baseline MB Q index. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] [xd->mode_info_context->mbmi.segment_id]; /* Delta Value */ diff --git a/media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm similarity index 97% rename from media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm rename to media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm index 0297220ee1..200b4ccfe6 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/encodeopt.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/block_error_sse2.asm @@ -11,8 +11,10 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) -global sym(vp8_block_error_sse2) PRIVATE +globalsym(vp8_block_error_sse2) sym(vp8_block_error_sse2): push rbp mov rbp, rsp @@ -60,7 +62,7 @@ sym(vp8_block_error_sse2): ret ;int vp8_mbblock_error_sse2_impl(short *coeff_ptr, short *dcoef_ptr, int dc); -global sym(vp8_mbblock_error_sse2_impl) PRIVATE +globalsym(vp8_mbblock_error_sse2_impl) sym(vp8_mbblock_error_sse2_impl): push rbp mov rbp, rsp @@ -130,7 +132,7 @@ sym(vp8_mbblock_error_sse2_impl): ;int vp8_mbuverror_sse2_impl(short *s_ptr, short *d_ptr); -global sym(vp8_mbuverror_sse2_impl) PRIVATE +globalsym(vp8_mbuverror_sse2_impl) sym(vp8_mbuverror_sse2_impl): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm similarity index 98% rename from media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm rename to media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm index 86fae26956..fe78da398e 100644 --- a/media/libvpx/libvpx/vp8/common/x86/copy_sse2.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse2.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_copy32xn_sse2( ; unsigned char *src_ptr, @@ -18,7 +19,7 @@ ; unsigned char *dst_ptr, ; int dst_stride, ; int height); -global sym(vp8_copy32xn_sse2) PRIVATE +globalsym(vp8_copy32xn_sse2) sym(vp8_copy32xn_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm similarity index 99% rename from media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm rename to media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm index d789a40ccf..c40b2d8bf6 100644 --- a/media/libvpx/libvpx/vp8/common/x86/copy_sse3.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/copy_sse3.asm @@ -83,6 +83,7 @@ ret %endmacro +SECTION .text ;void vp8_copy32xn_sse3( ; unsigned char *src_ptr, @@ -90,7 +91,7 @@ ; unsigned char *dst_ptr, ; int dst_stride, ; int height); -global sym(vp8_copy32xn_sse3) PRIVATE +globalsym(vp8_copy32xn_sse3) sym(vp8_copy32xn_sse3): STACK_FRAME_CREATE_X3 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm index d06bca5927..3c28cb902e 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -60,8 +60,10 @@ ret %endmacro +SECTION .text + ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) -global sym(vp8_short_fdct4x4_sse2) PRIVATE +globalsym(vp8_short_fdct4x4_sse2) sym(vp8_short_fdct4x4_sse2): STACK_FRAME_CREATE @@ -166,7 +168,7 @@ sym(vp8_short_fdct4x4_sse2): STACK_FRAME_DESTROY ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch) -global sym(vp8_short_fdct8x4_sse2) PRIVATE +globalsym(vp8_short_fdct8x4_sse2) sym(vp8_short_fdct8x4_sse2): STACK_FRAME_CREATE diff --git a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c index 89cad53356..f35b930169 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c +++ b/media/libvpx/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); - unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); + unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba)); return sum_diff; } diff --git a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm index f4989279f4..938fc173ff 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -11,8 +11,10 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) -global sym(vp8_short_walsh4x4_sse2) PRIVATE +globalsym(vp8_short_walsh4x4_sse2) sym(vp8_short_walsh4x4_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm b/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm deleted file mode 100644 index 2864ce16d9..0000000000 --- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_mmx.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE -sym(vp8_fast_quantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movq mm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movq mm1, [rax] - - movq mm3, mm0 - psraw mm0, 15 - - pxor mm3, mm0 - psubw mm3, mm0 ; abs - - movq mm2, mm3 - pcmpgtw mm1, mm2 - - pandn mm1, mm2 - movq mm3, mm1 - - mov rdx, arg(6) ;quant_ptr - movq mm1, [rdx] - - mov rcx, arg(5) ;round_ptr - movq mm2, [rcx] - - paddw mm3, mm2 - pmulhuw mm3, mm1 - - pxor mm3, mm0 - psubw mm3, mm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movq mm0, mm3 - - movq [rdi], mm3 - - mov rax, arg(3) ;dequant_ptr - movq mm2, [rax] - - pmullw mm3, mm2 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax], mm3 - - ; next 8 - movq mm4, [rsi+8] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+8] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+8] - movq mm6, [rcx+8] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+8], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+8] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+8], mm7 - - - ; next 8 - movq mm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+16] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+16] - movq mm6, [rcx+16] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+16], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+16] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+16], mm7 - - - ; next 8 - movq mm4, [rsi+24] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+24] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+24] - movq mm6, [rcx+24] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+24], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+24] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+24], mm7 - - - - mov rdi, arg(4) ;scan_mask - mov rsi, arg(2) ;qcoeff_ptr - - pxor mm5, mm5 - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rsi+8] - - movq mm2, [rdi] - movq mm3, [rdi+8]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - movq mm5, mm0 - - paddd mm5, mm1 - - movq mm0, [rsi+16] - movq mm1, [rsi+24] - - movq mm2, [rdi+16] - movq mm3, [rdi+24]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - paddd mm5, mm0 - - paddd mm5, mm1 - movq mm0, mm5 - - psrlq mm5, 32 - paddd mm0, mm5 - - ; eob adjustment begins here - movq rcx, mm0 - and rcx, 0xffff - - xor rdx, rdx - sub rdx, rcx ; rdx=-rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - ; Substitute the sse assembly for the old mmx mixed assembly/C. The - ; following is kept as reference - ; movq rcx, mm0 - ; bsr rax, rcx - ; - ; mov eob, rax - ; mov eee, rcx - ; - ;if(eee==0) - ;{ - ; eob=-1; - ;} - ;else if(eee<0) - ;{ - ; eob=15; - ;} - ;d->eob = eob+1; - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c index 6f2c163492..4c2d24cc27 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c +++ b/media/libvpx/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -12,27 +12,17 @@ #include "./vp8_rtcd.h" #include "vp8/encoder/block.h" -#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ +#include "vpx_ports/bitops.h" /* get_lsb */ +#include "vpx_ports/compiler_attributes.h" -#define SELECT_EOB(i, z, x, y, q) \ - do { \ - short boost = *zbin_boost_ptr; \ - short x_z = _mm_extract_epi16(x, z); \ - short y_z = _mm_extract_epi16(y, z); \ - int cmp = (x_z < boost) | (y_z == 0); \ - zbin_boost_ptr++; \ - if (cmp) break; \ - q = _mm_insert_epi16(q, y_z, z); \ - eob = i; \ - zbin_boost_ptr = b->zrun_zbin_boost; \ - } while (0) - -void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { - char eob = 0; +// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask. +VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b, + BLOCKD *d) { + int eob = -1; short *zbin_boost_ptr = b->zrun_zbin_boost; - - __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, - dqcoeff1; + __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr)); + __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8)); + __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); @@ -46,22 +36,20 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); - __m128i qcoeff0 = _mm_setzero_si128(); - __m128i qcoeff1 = _mm_setzero_si128(); + __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1; + uint32_t mask, ymask; + DECLARE_ALIGNED(16, static const uint8_t, + zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; + DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 }; /* Duplicate to all lanes. */ zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); - /* Sign of z: z >> 15 */ - sz0 = _mm_srai_epi16(z0, 15); - sz1 = _mm_srai_epi16(z1, 15); - - /* x = abs(z): (z ^ sz) - sz */ - x0 = _mm_xor_si128(z0, sz0); - x1 = _mm_xor_si128(z1, sz1); - x0 = _mm_sub_epi16(x0, sz0); - x1 = _mm_sub_epi16(x1, sz1); + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); /* zbin[] + zbin_extra */ zbin0 = _mm_add_epi16(zbin0, zbin_extra); @@ -89,29 +77,56 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { y0 = _mm_mulhi_epi16(y0, quant_shift0); y1 = _mm_mulhi_epi16(y1, quant_shift1); - /* Return the sign: (y ^ sz) - sz */ - y0 = _mm_xor_si128(y0, sz0); - y1 = _mm_xor_si128(y1, sz1); - y0 = _mm_sub_epi16(y0, sz0); - y1 = _mm_sub_epi16(y1, sz1); + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); - /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ - SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0); - SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1); - SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1); + { + const __m128i zig_zag_i16_0 = + _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13); + const __m128i zig_zag_i16_1 = + _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13); + + /* The first part of the zig zag needs a value + * from x_minus_zbin1 and vice versa. */ + t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2); + t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80); + t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80); + x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0); + x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1); + } + + /* Check if y is nonzero and put it in zig zag order. */ + t0 = _mm_packs_epi16(y0, y1); + t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128()); + t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask)); + ymask = _mm_movemask_epi8(t0) ^ 0xffff; + + for (;;) { + t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0); + t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1); + t0 = _mm_packs_epi16(t0, t1); + mask = _mm_movemask_epi8(t0); + mask = ~mask & ymask; + if (!mask) break; + /* |eob| will contain the index of the next found element where: + * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */ + eob = get_lsb(mask); + /* Need to clear the mask from processed elements so that + * they are no longer counted in the next iteration. */ + ymask &= ~1U << eob; + /* It's safe to read ahead of this buffer if struct VP8_COMP has at + * least 32 bytes before the zrun_zbin_boost_* fields (it has 384). + * Any data read outside of the buffer is masked by the updated |ymask|. */ + zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1)); + zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7)); + qcoeff[zig_zag_mask[eob]] = 0xffff; + } + + qcoeff0 = _mm_load_si128((__m128i *)(qcoeff)); + qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8)); + qcoeff0 = _mm_and_si128(qcoeff0, y0); + qcoeff1 = _mm_and_si128(qcoeff1, y1); _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); @@ -122,5 +137,5 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); - *d->eob = eob; + *d->eob = eob + 1; } diff --git a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm index bd92b398a0..67102064a1 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/media/libvpx/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ; void vp8_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 @@ -20,7 +22,7 @@ ; int filter_weight, | 5 ; unsigned int *accumulator, | 6 ; unsigned short *count) | 7 -global sym(vp8_temporal_filter_apply_sse2) PRIVATE +globalsym(vp8_temporal_filter_apply_sse2) sym(vp8_temporal_filter_apply_sse2): push rbp @@ -203,5 +205,5 @@ align 16 _const_top_bit: times 8 dw 1<<15 align 16 -_const_16w +_const_16w: times 8 dw 16 diff --git a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c deleted file mode 100644 index 4406dd0cc4..0000000000 --- a/media/libvpx/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/x86.h" -#include "vp8/encoder/block.h" - -int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *scan_mask, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - const short *scan_mask = vp8_default_zig_zag_mask; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - *d->eob = (char)vp8_fast_quantize_b_impl_mmx( - coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, - - round_ptr, quant_ptr, dqcoeff_ptr); -} diff --git a/media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c similarity index 76% rename from media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c rename to media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index 322f0a151f..f6df146f08 100644 --- a/media/libvpx/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/media/libvpx/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -10,32 +10,9 @@ #include /* SSSE3 */ +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" - -/* bitscan reverse (bsr) */ -#if defined(_MSC_VER) -#include -#pragma intrinsic(_BitScanReverse) -static int bsr(int mask) { - unsigned long eob; - _BitScanReverse(&eob, mask); - eob++; - if (mask == 0) eob = 0; - return eob; -} -#else -static int bsr(int mask) { - int eob; -#if defined(__GNUC__) && __GNUC__ - __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) - asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags"); -#endif - eob++; - if (mask == 0) eob = 0; - return eob; -} -#endif +#include "vpx_ports/bitops.h" /* get_msb */ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { int eob, mask; @@ -51,9 +28,9 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; - DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = { - 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 - }; + DECLARE_ALIGNED(16, const uint8_t, + pshufb_zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); /* sign of z: z >> 15 */ @@ -107,7 +84,10 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { mask = _mm_movemask_epi8(x); - eob = bsr(mask); + /* x2 is needed to increase the result from non-zero masks by 1, + * +1 is needed to mask undefined behavior for a null argument, + * the result of get_msb(1) is 0 */ + eob = get_msb(mask * 2 + 1); - *d->eob = 0xFF & eob; + *d->eob = eob; } diff --git a/media/libvpx/libvpx/vp8/vp8_common.mk b/media/libvpx/libvpx/vp8/vp8_common.mk index 137f5bb627..d485965d3d 100644 --- a/media/libvpx/libvpx/vp8/vp8_common.mk +++ b/media/libvpx/libvpx/vp8/vp8_common.mk @@ -15,7 +15,6 @@ VP8_COMMON_SRCS-yes += common/onyxd.h VP8_COMMON_SRCS-yes += common/alloccommon.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h -VP8_COMMON_SRCS-yes += common/copy_c.c # VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h VP8_COMMON_SRCS-yes += common/dequantize.c @@ -70,10 +69,8 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h VP8_COMMON_SRCS-yes += common/treecoder.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c +VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/vp8_asm_stubs.c +VP8_COMMON_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c @@ -82,21 +79,20 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm endif -ifeq ($(ARCH_X86_64),yes) +ifeq ($(VPX_ARCH_X86_64),yes) VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2_x86_64.asm endif @@ -116,20 +112,32 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif +# common (loongarch LSX intrinsics) +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c +VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c + # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.h VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_loopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c index 4a3e8f2feb..d2b7184271 100644 --- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -8,22 +8,31 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include +#include +#include +#include + #include "./vpx_config.h" #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" -#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" #include "vpx_mem/vpx_mem.h" -#include "vpx_ports/vpx_once.h" +#include "vpx_ports/static_assert.h" +#include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" #include "vp8/common/onyx.h" #include "vp8/common/common.h" -#include -#include struct vp8_extracfg { struct vpx_codec_pkt_list *pkt_list; @@ -74,6 +83,9 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp8_extracfg vp8_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP8_CONFIG oxcf; struct VP8_COMP *cpi; unsigned char *cx_data; @@ -87,13 +99,16 @@ struct vpx_codec_alg_priv { vpx_enc_frame_flags_t control_frame_flags; }; +// Called by vp8e_set_config() and vp8e_encode() only. Must not be called +// by vp8e_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after vp8e_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { - vpx_codec_err_t res; + const vpx_codec_err_t res = error->error_code; - if ((res = error->error_code)) { + if (res != VPX_CODEC_OK) ctx->base.err_detail = error->has_detail ? error->detail : NULL; - } return res; } @@ -105,10 +120,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -146,8 +161,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); #endif RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); - RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); - RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); @@ -251,6 +266,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("g_threads cannot be bigger than number of token partitions"); #endif + // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); + return VPX_CODEC_OK; } @@ -259,10 +291,11 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, switch (img->fmt) { case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_I420: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: break; + case VPX_IMG_FMT_NV12: break; default: - ERROR("Invalid image format. Only YV12 and I420 images are supported"); + ERROR( + "Invalid image format. Only YV12, I420 and NV12 images are " + "supported"); } if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h)) @@ -315,7 +348,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->end_usage = USAGE_CONSTANT_QUALITY; } - oxcf->target_bandwidth = cfg.rc_target_bitrate; + // Cap the target rate to 1000 Mbps to avoid some integer overflows in + // target bandwidth calculations. + oxcf->target_bandwidth = VPXMIN(cfg.rc_target_bitrate, 1000000); oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; @@ -362,8 +397,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, if (mr_cfg) { oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; - oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; - oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_down_sampling_factor = mr_cfg->mr_down_sampling_factor; oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; } #else @@ -371,6 +405,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, #endif oxcf->cpu_used = vp8_cfg.cpu_used; + if (cfg.g_pass == VPX_RC_FIRST_PASS) { + oxcf->cpu_used = VPXMAX(4, oxcf->cpu_used); + } oxcf->encode_breakout = vp8_cfg.static_thresh; oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; @@ -445,14 +482,29 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0); + if (res != VPX_CODEC_OK) return res; - if (!res) { - ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } - return res; + ctx->cpi->common.error.setjmp = 1; + ctx->cfg = *cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) { + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_ERROR; + } +#endif + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -484,6 +536,9 @@ static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + // Use fastest speed setting (speed 16 or -16) if it's set beyond the range. + extra_cfg.cpu_used = VPXMIN(16, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-16, extra_cfg.cpu_used); return update_extracfg(ctx, &extra_cfg); } @@ -575,9 +630,21 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP8_COMP *cpi = ctx->cpi; + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); + if (data) { + cpi->cyclic_refresh_mode_enabled = 0; + cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, void **mem_loc) { - vpx_codec_err_t res = 0; + vpx_codec_err_t res = VPX_CODEC_OK; #if CONFIG_MULTI_RES_ENCODING LOWER_RES_FRAME_INFO *shared_mem_loc; @@ -586,12 +653,13 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO)); if (!shared_mem_loc) { - res = VPX_CODEC_MEM_ERROR; + return VPX_CODEC_MEM_ERROR; } shared_mem_loc->mb_info = calloc(mb_rows * mb_cols, sizeof(LOWER_RES_MB_INFO)); if (!(shared_mem_loc->mb_info)) { + free(shared_mem_loc); res = VPX_CODEC_MEM_ERROR; } else { *mem_loc = (void *)shared_mem_loc; @@ -599,11 +667,22 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, } #else (void)cfg; - (void)mem_loc; + *mem_loc = NULL; #endif return res; } +static void vp8e_mr_free_mem(void *mem_loc) { +#if CONFIG_MULTI_RES_ENCODING + LOWER_RES_FRAME_INFO *shared_mem_loc = (LOWER_RES_FRAME_INFO *)mem_loc; + free(shared_mem_loc->mb_info); + free(mem_loc); +#else + (void)mem_loc; + assert(!mem_loc); +#endif +} + static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -641,6 +720,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, priv->cx_data = malloc(priv->cx_data_sz); if (!priv->cx_data) { + priv->cx_data_sz = 0; return VPX_CODEC_MEM_ERROR; } @@ -650,14 +730,30 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, ctx->priv->enc.total_encoders = 1; } - once(vp8_initialize_enc); + vp8_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); if (!res) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); priv->cpi = vp8_create_compressor(&priv->oxcf); - if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; + if (!priv->cpi) { +#if CONFIG_MULTI_RES_ENCODING + // Release ownership of mr_cfg->mr_low_res_mode_info on failure. This + // prevents ownership confusion with the caller and avoids a double + // free when vpx_codec_destroy() is called on this instance. + priv->oxcf.mr_total_resolutions = 0; + priv->oxcf.mr_encoder_id = 0; + priv->oxcf.mr_low_res_mode_info = NULL; +#endif + res = VPX_CODEC_MEM_ERROR; + } } } @@ -669,10 +765,7 @@ static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { /* Free multi-encoder shared memory */ if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions - 1)) { - LOWER_RES_FRAME_INFO *shared_mem_loc = - (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info; - free(shared_mem_loc->mb_info); - free(ctx->oxcf.mr_low_res_mode_info); + vp8e_mr_free_mem(ctx->oxcf.mr_low_res_mode_info); } #endif @@ -709,9 +802,9 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, return res; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - unsigned long deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { int new_qc; #if !(CONFIG_REALTIME_ONLY) @@ -719,12 +812,16 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, new_qc = MODE_BESTQUALITY; if (deadline) { - uint64_t duration_us; - /* Convert duration parameter from stream timebase to microseconds */ - duration_us = (uint64_t)duration * 1000000 * - (uint64_t)ctx->cfg.g_timebase.num / - (uint64_t)ctx->cfg.g_timebase.den; + VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + if (duration > UINT64_MAX / (uint64_t)ctx->timestamp_ratio.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = + duration * (uint64_t)ctx->timestamp_ratio.num / + ((uint64_t)ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); /* If the deadline is more that the duration this frame is to be shown, * use good quality mode. Otherwise use realtime mode. @@ -750,6 +847,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.Mode = new_qc; vp8_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, @@ -798,17 +896,33 @@ static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + vpx_enc_frame_flags_t enc_flags, + vpx_enc_deadline_t deadline) { + volatile vpx_codec_err_t res = VPX_CODEC_OK; + // Make a copy as volatile to avoid -Wclobbered with longjmp. + volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts_val = pts; - if (!ctx->cfg.rc_target_bitrate) return res; + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } if (img) res = validate_img(ctx, img); if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); - pick_quickcompress_mode(ctx, duration, deadline); + if (!res) res = pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); // If no flags are set in the encode call, then use the frame flags as @@ -829,20 +943,37 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } - /* Initialize the encoder instance on the first frame*/ + /* Initialize the encoder instance on the first frame */ if (!res && ctx->cpi) { unsigned int lib_flags; - YV12_BUFFER_CONFIG sd; int64_t dst_time_stamp, dst_end_time_stamp; size_t size, cx_data_sz; unsigned char *cx_data; unsigned char *cx_data_end; int comp_data_state = 0; - /* Set up internal flags */ - if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { - ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + res = update_error_state(ctx, &ctx->cpi->common.error); + vpx_clear_system_state(); + return res; } + ctx->cpi->common.error.setjmp = 1; + + // Per-frame PSNR is not supported when g_lag_in_frames is greater than 0. + if ((flags & VPX_EFLAG_CALCULATE_PSNR) && ctx->cfg.g_lag_in_frames != 0) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INCAPABLE, + "Cannot calculate per-frame PSNR when g_lag_in_frames is nonzero"); + } + /* Set up internal flags */ +#if CONFIG_INTERNAL_STATS + assert(((VP8_COMP *)ctx->cpi)->b_calculate_psnr == 1); +#else + ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = + (ctx->base.init_flags & VPX_CODEC_USE_PSNR) || + (flags & VPX_EFLAG_CALCULATE_PSNR); +#endif if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION) { ((VP8_COMP *)ctx->cpi)->output_partition = 1; @@ -851,28 +982,50 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - /* vp8 use 10,000,000 ticks/second as time stamp */ - dst_time_stamp = - pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; - if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + if (pts_val < ctx->pts_offset) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts_val -= ctx->pts_offset; + if (pts_val > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + dst_time_stamp = + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts_val > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts_val + (int64_t)duration; + if (pts_end > INT64_MAX / ctx->timestamp_ratio.num) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + dst_end_time_stamp = + pts_end * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, - &sd, dst_time_stamp, dst_end_time_stamp)) { - VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; - res = update_error_state(ctx, &cpi->common.error); - } + if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { + VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; + res = update_error_state(ctx, &cpi->common.error); } /* reset for next frame */ @@ -890,6 +1043,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, &dst_end_time_stamp, !img); if (comp_data_state == VPX_CODEC_CORRUPT_FRAME) { + ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_CORRUPT_FRAME; } else if (comp_data_state == -1) { break; @@ -901,16 +1055,21 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; /* Add the frame packet to the list of returned packets. */ - round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; + round = (vpx_codec_pts_t)ctx->timestamp_ratio.num / 2; + if (round > 0) --round; delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000; + (dst_time_stamp * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num + + ctx->pts_offset; pkt.data.frame.duration = - (unsigned long)((delta * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000); + (unsigned long)((delta * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num); pkt.data.frame.flags = lib_flags << 16; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; + pkt.data.frame.spatial_layer_encoded[0] = 1; if (lib_flags & FRAMEFLAGS_KEY) { pkt.data.frame.flags |= VPX_FRAME_IS_KEY; @@ -925,9 +1084,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, * Invisible frames have no duration. */ pkt.data.frame.pts = - ((cpi->last_time_stamp_seen * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000) + - 1; + ((cpi->last_time_stamp_seen * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num) + + ctx->pts_offset + 1; pkt.data.frame.duration = 0; } @@ -974,6 +1133,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } } + ctx->cpi->common.error.setjmp = 0; } return res; @@ -1139,8 +1299,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ @@ -1179,13 +1339,14 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, + { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, { -1, NULL }, }; static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, { - 0, /* g_usage */ + 0, /* g_usage (unused) */ 0, /* g_threads */ 0, /* g_profile */ @@ -1206,13 +1367,13 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 0, /* rc_resize_allowed */ 1, /* rc_scaled_width */ 1, /* rc_scaled_height */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 60, /* rc_resize_down_thresh */ + 30, /* rc_resize_up_thresh */ VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ { NULL, 0 }, /* rc_firstpass_mb_stats_in */ - 256, /* rc_target_bandwidth */ + 256, /* rc_target_bitrate */ 4, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ 100, /* rc_undershoot_pct */ @@ -1225,6 +1386,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* rc_two_pass_vbrmin_section */ 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only has meaningfull for VP9) /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ @@ -1233,14 +1395,30 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ { 0 }, - { 0 }, /* ss_target_bitrate */ - 1, /* ts_number_layers */ - { 0 }, /* ts_target_bitrate */ - { 0 }, /* ts_rate_decimator */ - 0, /* ts_periodicity */ - { 0 }, /* ts_layer_id */ - { 0 }, /* layer_target_bitrate */ - 0 /* temporal_layering_mode */ + { 0 }, /* ss_target_bitrate */ + 1, /* ts_number_layers */ + { 0 }, /* ts_target_bitrate */ + { 0 }, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + { 0 }, /* ts_layer_id */ + { 0 }, /* layer_target_bitrate */ + 0, /* temporal_layering_mode */ + 0, /* use_vizier_rc_params */ + { 1, 1 }, /* active_wq_factor */ + { 1, 1 }, /* err_per_mb_factor */ + { 1, 1 }, /* sr_default_decay_limit */ + { 1, 1 }, /* sr_diff_factor */ + { 1, 1 }, /* kf_err_per_mb_factor */ + { 1, 1 }, /* kf_frame_min_boost_factor */ + { 1, 1 }, /* kf_frame_max_boost_first_factor */ + { 1, 1 }, /* kf_frame_max_boost_subs_factor */ + { 1, 1 }, /* kf_max_total_boost_factor */ + { 1, 1 }, /* gf_max_total_boost_factor */ + { 1, 1 }, /* gf_frame_max_boost_factor */ + { 1, 1 }, /* zm_factor */ + { 1, 1 }, /* rd_mult_inter_qp_fac */ + { 1, 1 }, /* rd_mult_arf_qp_fac */ + { 1, 1 }, /* rd_mult_key_qp_fac */ } }, }; @@ -1267,6 +1445,10 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = { vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ - vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem, + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, + vp8e_mr_free_mem, } /* encoder functions */ }; diff --git a/media/libvpx/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/libvpx/vp8/vp8_dx_iface.c index 3cc3f92f90..65a86e2dc7 100644 --- a/media/libvpx/libvpx/vp8/vp8_dx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_dx_iface.c @@ -20,6 +20,7 @@ #include "vpx_version.h" #include "common/alloccommon.h" #include "common/common.h" +#include "common/onyxc_int.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -38,13 +39,19 @@ typedef vpx_codec_stream_info_t vp8_stream_info_t; /* Structures for handling memory allocations */ typedef enum { VP8_SEG_ALG_PRIV = 256, VP8_SEG_MAX } mem_seg_id_t; -#define NELEMENTS(x) ((int)(sizeof(x) / sizeof(x[0]))) +#define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0]))) struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp8_stream_info_t si; int decoder_init; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -80,7 +87,6 @@ static int vp8_init_ctx(vpx_codec_ctx_t *ctx) { static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; - vpx_codec_alg_priv_t *priv = NULL; (void)data; vp8_rtcd(); @@ -92,7 +98,10 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, * information becomes known. */ if (!ctx->priv) { + vpx_codec_alg_priv_t *priv; + if (vp8_init_ctx(ctx)) return VPX_CODEC_MEM_ERROR; + priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ @@ -102,8 +111,6 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); /*post processing level initialized to do nothing */ - } else { - priv = (vpx_codec_alg_priv_t *)ctx->priv; } return res; @@ -144,8 +151,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, } si->is_kf = 0; - if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ - { + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ si->is_kf = 1; /* vet via sync code */ @@ -157,7 +163,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ - if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME; + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } } else { res = VPX_CODEC_UNSUP_BITSTREAM; } @@ -201,9 +210,9 @@ static vpx_codec_err_t update_error_state( static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ img->fmt = VPX_IMG_FMT_I420; img->w = yv12->y_stride; img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; @@ -228,7 +237,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, - unsigned int data_sz, vpx_codec_err_t *res) { + unsigned int data_sz, + volatile vpx_codec_err_t *res) { *res = VPX_CODEC_OK; if (ctx->fragments.count == 0) { @@ -240,14 +250,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, /* Store a pointer to this fragment and return. We haven't * received the complete frame yet, so we will wait with decoding. */ - ctx->fragments.ptrs[ctx->fragments.count] = data; - ctx->fragments.sizes[ctx->fragments.count] = data_sz; - ctx->fragments.count++; - if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) { + if (ctx->fragments.count >= MAX_PARTITIONS) { ctx->fragments.count = 0; *res = VPX_CODEC_INVALID_PARAM; return -1; } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; return 0; } @@ -266,10 +276,10 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; - unsigned int resolution_change = 0; - unsigned int w, h; + void *user_priv) { + volatile vpx_codec_err_t res; + volatile unsigned int resolution_change = 0; + volatile unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { return 0; @@ -295,9 +305,39 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; +#if CONFIG_MULTITHREAD + if (!res && ctx->restart_threads) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; + if (setjmp(pbi->common.error.jmp)) { + pbi->common.error.setjmp = 0; + vp8_decoder_remove_threads(pbi); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; + } + pbi->common.error.setjmp = 1; + pbi->max_threads = ctx->cfg.threads; + vp8_decoder_create_threads(pbi); + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, pc->mb_rows); + } + ctx->restart_threads = 0; + pbi->common.error.setjmp = 0; + } +#endif /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { VP8D_CONFIG oxcf; @@ -322,7 +362,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - if (res == VPX_CODEC_OK) ctx->decoder_init = 1; + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } } /* Set these even if already initialized. The caller may have changed the @@ -335,8 +382,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; if (resolution_change) { - VP8_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; #if CONFIG_MULTITHREAD int i; @@ -344,8 +391,6 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pc->Width = ctx->si.w; pc->Height = ctx->si.h; { - int prev_mb_rows = pc->mb_rows; - if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; /* on failure clear the cached resolution to ensure a full @@ -371,6 +416,12 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, "Invalid frame height"); } +#if CONFIG_MULTITHREAD + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows); + } +#endif + if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) { vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffers"); @@ -414,11 +465,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #endif #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd) { - vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, 0); } -#else - (void)prev_mb_rows; #endif } @@ -428,16 +477,44 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pbi->common.fb_idx_ref_cnt[0] = 0; } + if (setjmp(pbi->common.error.jmp)) { + vpx_clear_system_state(); + /* We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. + */ + pc->yv12_fb[pc->lst_fb_idx].corrupted = 1; + + if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) { + pc->fb_idx_ref_cnt[pc->new_fb_idx]--; + } + pbi->common.error.setjmp = 0; +#if CONFIG_MULTITHREAD + if (pbi->restart_threads) { + ctx->si.w = 0; + ctx->si.h = 0; + ctx->restart_threads = 1; + } +#endif + res = update_error_state(ctx, &pbi->common.error); + return res; + } + + pbi->common.error.setjmp = 1; + /* update the pbi fragment data */ pbi->fragments = ctx->fragments; - +#if CONFIG_MULTITHREAD + pbi->restart_threads = 0; +#endif ctx->user_priv = user_priv; - if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline)) { + if (vp8dx_receive_compressed_data(pbi)) { res = update_error_state(ctx, &pbi->common.error); } /* get ready for the next series of fragments */ ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; } return res; @@ -452,7 +529,6 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, */ if (!(*iter) && ctx->yv12_frame_buffers.pbi[0]) { YV12_BUFFER_CONFIG sd; - int64_t time_stamp = 0, time_end_stamp = 0; vp8_ppflags_t flags; vp8_zero(flags); @@ -462,8 +538,7 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, flags.noise_level = ctx->postproc_cfg.noise_level; } - if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, - &time_stamp, &time_end_stamp, &flags)) { + if (0 == vp8dx_get_raw_frame(ctx->yv12_frame_buffers.pbi[0], &sd, &flags)) { yuvconfig2image(&ctx->img, &sd, ctx->user_priv); img = &ctx->img; @@ -535,6 +610,16 @@ static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, } } +static vpx_codec_err_t vp8_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME; + *arg = vp8dx_get_quantizer(pbi); + return VPX_CODEC_OK; +} + static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, va_list args) { #if CONFIG_POSTPROC @@ -561,6 +646,7 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, if (update_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; + if (pbi == NULL) return VPX_CODEC_CORRUPT_FRAME; *update_info = pbi->common.refresh_alt_ref_frame * (int)VP8_ALTR_FRAME + pbi->common.refresh_golden_frame * (int)VP8_GOLD_FRAME + @@ -572,20 +658,22 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } -extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) { int *ref_info = va_arg(args, int *); if (ref_info) { VP8D_COMP *pbi = (VP8D_COMP *)ctx->yv12_frame_buffers.pbi[0]; - VP8_COMMON *oci = &pbi->common; - *ref_info = - (vp8dx_references_buffer(oci, ALTREF_FRAME) ? VP8_ALTR_FRAME : 0) | - (vp8dx_references_buffer(oci, GOLDEN_FRAME) ? VP8_GOLD_FRAME : 0) | - (vp8dx_references_buffer(oci, LAST_FRAME) ? VP8_LAST_FRAME : 0); - - return VPX_CODEC_OK; + if (pbi) { + VP8_COMMON *oci = &pbi->common; + *ref_info = + (vp8dx_references_buffer(oci, ALTREF_FRAME) ? VP8_ALTR_FRAME : 0) | + (vp8dx_references_buffer(oci, GOLDEN_FRAME) ? VP8_GOLD_FRAME : 0) | + (vp8dx_references_buffer(oci, LAST_FRAME) ? VP8_LAST_FRAME : 0); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_CORRUPT_FRAME; + } } else { return VPX_CODEC_INVALID_PARAM; } @@ -620,13 +708,14 @@ static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } -vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { +static vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = { { VP8_SET_REFERENCE, vp8_set_reference }, { VP8_COPY_REFERENCE, vp8_get_reference }, { VP8_SET_POSTPROC, vp8_set_postproc }, { VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted }, { VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame }, + { VPXD_GET_LAST_QUANTIZER, vp8_get_quantizer }, { VPXD_SET_DECRYPTOR, vp8_set_decryptor }, { -1, NULL }, }; @@ -658,6 +747,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = { NULL, /* vpx_codec_enc_config_set_fn_t */ NULL, /* vpx_codec_get_global_headers_fn_t */ NULL, /* vpx_codec_get_preview_frame_fn_t */ - NULL /* vpx_codec_enc_mr_get_mem_loc_fn_t */ + NULL, /* vpx_codec_enc_mr_get_mem_loc_fn_t */ + NULL /* vpx_codec_enc_mr_free_mem_loc_fn_t */ } }; diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc new file mode 100644 index 0000000000..312092f190 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/vp8_ratectrl_rtc.h" + +#include + +#include + +#include "vp8/common/common.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/ratectrl.h" +#include "vpx_ports/system_state.h" + +namespace libvpx { +/* Quant MOD */ +static const int kQTrans[] = { + 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, + 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 35, 37, 39, 41, + 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127, +}; + +static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16, + 16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, + 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30 +}; + +static const unsigned char inter_minq[QINDEX_RANGE] = { + 0, 0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 11, + 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24, + 24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53, + 54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, + 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 +}; + +static int rescale(int val, int num, int denom) { + int64_t llnum = num; + int64_t llden = denom; + int64_t llval = val; + + return (int)(llval * llnum / llden); +} + +std::unique_ptr VP8RateControlRTC::Create( + const VP8RateControlRtcConfig &cfg) { + std::unique_ptr rc_api(new (std::nothrow) + VP8RateControlRTC()); + if (!rc_api) return nullptr; + rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); + if (!rc_api->cpi_) return nullptr; + vp8_zero(*rc_api->cpi_); + + if (!rc_api->InitRateControl(cfg)) return nullptr; + + return rc_api; +} + +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + +bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + cpi_->pass = 0; + cm->show_frame = 1; + oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + cpi_->auto_gold = 1; + cpi_->key_frame_count = 1; + cpi_->rate_correction_factor = 1.0; + cpi_->key_frame_rate_correction_factor = 1.0; + cpi_->cyclic_refresh_mode_enabled = 0; + cpi_->auto_worst_q = 1; + cpi_->kf_overspend_bits = 0; + cpi_->kf_bitrate_adjustment = 0; + cpi_->gf_overspend_bits = 0; + cpi_->non_gf_bitrate_adjustment = 0; + if (!UpdateRateControl(rc_cfg)) return false; + cpi_->buffer_level = oxcf->starting_buffer_level; + cpi_->bits_off_target = oxcf->starting_buffer_level; + return true; +} + +bool VP8RateControlRTC::UpdateRateControl( + const VP8RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) { + return false; + } + + VP8_COMMON *cm = &cpi_->common; + VP8_CONFIG *oxcf = &cpi_->oxcf; + const unsigned int prev_number_of_layers = oxcf->number_of_layers; + vpx_clear_system_state(); + cm->Width = rc_cfg.width; + cm->Height = rc_cfg.height; + oxcf->Width = rc_cfg.width; + oxcf->Height = rc_cfg.height; + oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer]; + oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer]; + cpi_->worst_quality = oxcf->worst_allowed_q; + cpi_->best_quality = oxcf->best_allowed_q; + cpi_->output_framerate = rc_cfg.framerate; + oxcf->target_bandwidth = + static_cast(1000 * rc_cfg.target_bandwidth); + cpi_->ref_framerate = cpi_->output_framerate; + oxcf->fixed_q = -1; + oxcf->error_resilient_mode = 1; + oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz; + oxcf->starting_buffer_level = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size = rc_cfg.buf_sz; + oxcf->number_of_layers = rc_cfg.ts_number_layers; + cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; + oxcf->under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; + cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { + cpi_->prior_key_frame_distance[i] = + static_cast(cpi_->output_framerate); + } + oxcf->screen_content_mode = rc_cfg.is_screen; + if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) { + memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate, + sizeof(rc_cfg.layer_target_bitrate)); + memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator, + sizeof(rc_cfg.ts_rate_decimator)); + if (cm->current_video_frame == 0) { + double prev_layer_framerate = 0; + for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) { + vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i]; + } + } else if (oxcf->number_of_layers != prev_number_of_layers) { + // The number of temporal layers has changed, so reset/initialize the + // temporal layer context for the new layer configuration: this means + // calling vp8_reset_temporal_layer_change() below. + + // Start at the base of the pattern cycle, so set the layer id to 0 and + // reset the temporal pattern counter. + // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls + // the layer_id) so remove. + if (cpi_->temporal_layer_id > 0) { + cpi_->temporal_layer_id = 0; + } + cpi_->temporal_pattern_counter = 0; + + vp8_reset_temporal_layer_change(cpi_, oxcf, + static_cast(prev_number_of_layers)); + } + } + + cpi_->total_actual_bits = 0; + cpi_->total_target_vs_actual = 0; + + cm->mb_rows = cm->Height >> 4; + cm->mb_cols = cm->Width >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + cm->mode_info_stride = cm->mb_cols + 1; + + // For temporal layers: starting/maximum/optimal_buffer_level is already set + // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change(). + if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) { + oxcf->starting_buffer_level = + rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000); + /* Set or reset optimal and maximum buffer levels. */ + if (oxcf->optimal_buffer_level == 0) { + oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8; + } else { + oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level, + oxcf->target_bandwidth, 1000); + } + if (oxcf->maximum_buffer_size == 0) { + oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8; + } else { + oxcf->maximum_buffer_size = + rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000); + } + } + + if (cpi_->bits_off_target > oxcf->maximum_buffer_size) { + cpi_->bits_off_target = oxcf->maximum_buffer_size; + cpi_->buffer_level = cpi_->bits_off_target; + } + + vp8_new_framerate(cpi_, cpi_->framerate); + vpx_clear_system_state(); + return true; +} + +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { + VP8_COMMON *const cm = &cpi_->common; + vpx_clear_system_state(); + if (cpi_->oxcf.number_of_layers > 1) { + cpi_->temporal_layer_id = frame_params.temporal_layer_id; + const int layer = frame_params.temporal_layer_id; + vp8_update_layer_contexts(cpi_); + /* Restore layer specific context & set frame rate */ + vp8_restore_layer_context(cpi_, layer); + vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); + } + cm->frame_type = static_cast(frame_params.frame_type); + cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { + cpi_->common.frame_flags |= FRAMEFLAGS_KEY; + } + + cpi_->per_frame_bandwidth = static_cast( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && + cpi_->buffered_mode) { + /* Max adjustment is 1/4 */ + int Adjustment = cpi_->active_worst_quality / 4; + if (Adjustment) { + int buff_lvl_step; + if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) { + buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level) / + Adjustment); + if (buff_lvl_step) { + Adjustment = + (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) / + buff_lvl_step); + } else { + Adjustment = 0; + } + } + cpi_->active_worst_quality -= Adjustment; + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + } + } + + if (cpi_->ni_frames > 150) { + int q = cpi_->active_worst_quality; + if (cm->frame_type == KEY_FRAME) { + cpi_->active_best_quality = kf_high_motion_minq[q]; + } else { + cpi_->active_best_quality = inter_minq[q]; + } + + if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) { + cpi_->active_best_quality = cpi_->best_quality; + + } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) { + int Fraction = + (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) / + (cpi_->oxcf.maximum_buffer_size - + cpi_->oxcf.optimal_buffer_level)); + int min_qadjustment = + ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128; + + cpi_->active_best_quality -= min_qadjustment; + } + } + + /* Clip the active best and worst quality values to limits */ + if (cpi_->active_worst_quality > cpi_->worst_quality) { + cpi_->active_worst_quality = cpi_->worst_quality; + } + if (cpi_->active_best_quality < cpi_->best_quality) { + cpi_->active_best_quality = cpi_->best_quality; + } + if (cpi_->active_worst_quality < cpi_->active_best_quality) { + cpi_->active_worst_quality = cpi_->active_best_quality; + } + + q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); + vp8_set_quantizer(cpi_, q_); + vpx_clear_system_state(); + return FrameDropDecision::kOk; +} + +int VP8RateControlRTC::GetQP() const { return q_; } + +UVDeltaQP VP8RateControlRTC::GetUVDeltaQP() const { + VP8_COMMON *cm = &cpi_->common; + UVDeltaQP uv_delta_q; + uv_delta_q.uvdc_delta_q = cm->uvdc_delta_q; + uv_delta_q.uvac_delta_q = cm->uvac_delta_q; + return uv_delta_q; +} + +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + +void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { + VP8_COMMON *const cm = &cpi_->common; + vpx_clear_system_state(); + cpi_->total_byte_count += encoded_frame_size; + cpi_->projected_frame_size = static_cast(encoded_frame_size << 3); + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + cpi_->layer_context[i].total_byte_count += encoded_frame_size; + } + } + + vp8_update_rate_correction_factors(cpi_, 2); + + cpi_->last_q[cm->frame_type] = cm->base_qindex; + + if (cm->frame_type == KEY_FRAME) { + vp8_adjust_key_frame_context(cpi_); + } + + /* Keep a record of ambient average Q. */ + if (cm->frame_type != KEY_FRAME) { + cpi_->avg_frame_qindex = + (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2; + } + /* Keep a record from which we can calculate the average Q excluding + * key frames. + */ + if (cm->frame_type != KEY_FRAME) { + cpi_->ni_frames++; + /* Damp value for first few frames */ + if (cpi_->ni_frames > 150) { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames); + } else { + cpi_->ni_tot_qi += q_; + cpi_->ni_av_qi = + ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2; + } + + /* If the average Q is higher than what was used in the last + * frame (after going through the recode loop to keep the frame + * size within range) then use the last frame value - 1. The -1 + * is designed to stop Q and hence the data rate, from + * progressively falling away during difficult sections, but at + * the same time reduce the number of itterations around the + * recode loop. + */ + if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1; + } + + cpi_->bits_off_target += + cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size; + if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) { + cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size; + } + + cpi_->total_actual_bits += cpi_->projected_frame_size; + cpi_->buffer_level = cpi_->bits_off_target; + + /* Propagate values to higher temporal layers */ + if (cpi_->oxcf.number_of_layers > 1) { + for (unsigned int i = cpi_->current_layer + 1; + i < cpi_->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi_->layer_context[i]; + int bits_off_for_this_layer = (int)round( + lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size); + + lc->bits_off_target += bits_off_for_this_layer; + + /* Clip buffer level to maximum buffer size for the layer */ + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + + lc->total_actual_bits += cpi_->projected_frame_size; + lc->total_target_vs_actual += bits_off_for_this_layer; + lc->buffer_level = lc->bits_off_target; + } + } + + cpi_->common.current_video_frame++; + cpi_->frames_since_key++; + + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + vpx_clear_system_state(); +} +} // namespace libvpx diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h new file mode 100644 index 0000000000..b458b5ce65 --- /dev/null +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP8_RATECTRL_RTC_H_ +#define VPX_VP8_RATECTRL_RTC_H_ + +#include +#include +#include + +#include "vpx/internal/vpx_ratectrl_rtc.h" + +struct VP8_COMP; + +namespace libvpx { +struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { + VP8RateControlRtcConfig() { + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); + } +}; + +struct VP8FrameParamsQpRTC { + RcFrameType frame_type; + int temporal_layer_id; +}; + +class VP8RateControlRTC { + public: + static std::unique_ptr Create( + const VP8RateControlRtcConfig &cfg); + ~VP8RateControlRTC(); + + bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + // GetQP() needs to be called after ComputeQP() to get the latest QP + int GetQP() const; + // GetUVDeltaQP() needs to be called after ComputeQP() to get the latest + // delta QP for UV. + UVDeltaQP GetUVDeltaQP() const; + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated from frame qp. + int GetLoopfilterLevel() const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // Feedback to rate control with the size of current encoded frame + void PostEncodeUpdate(uint64_t encoded_frame_size); + + private: + VP8RateControlRTC() = default; + bool InitRateControl(const VP8RateControlRtcConfig &cfg); + struct VP8_COMP *cpi_ = nullptr; + int q_ = -1; +}; + +} // namespace libvpx + +#endif // VPX_VP8_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vp8/vp8cx.mk b/media/libvpx/libvpx/vp8/vp8cx.mk index 7bd41a3fb7..b4b3fda9ea 100644 --- a/media/libvpx/libvpx/vp8/vp8cx.mk +++ b/media/libvpx/libvpx/vp8/vp8cx.mk @@ -23,6 +23,7 @@ VP8_CX_SRCS-yes += vp8_cx_iface.c VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h VP8_CX_SRCS-yes += encoder/bitstream.c VP8_CX_SRCS-yes += encoder/boolhuff.c +VP8_CX_SRCS-yes += encoder/copy_c.c VP8_CX_SRCS-yes += encoder/dct.c VP8_CX_SRCS-yes += encoder/encodeframe.c VP8_CX_SRCS-yes += encoder/encodeframe.h @@ -30,6 +31,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h VP8_CX_SRCS-yes += encoder/firstpass.c VP8_CX_SRCS-yes += encoder/block.h VP8_CX_SRCS-yes += encoder/boolhuff.h @@ -56,11 +58,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h VP8_CX_SRCS-yes += encoder/vp8_quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += encoder/rdopt.c VP8_CX_SRCS-yes += encoder/segmentation.c VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h VP8_CX_SRCS-yes += encoder/tokenize.c VP8_CX_SRCS-yes += encoder/dct_value_cost.h VP8_CX_SRCS-yes += encoder/dct_value_tokens.h @@ -68,29 +73,31 @@ VP8_CX_SRCS-yes += encoder/treewriter.c VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse3.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c endif +VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/block_error_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm @@ -106,6 +113,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c endif @@ -114,4 +124,9 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c endif +# common (loongarch LSX intrinsics) +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c +VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c + VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c new file mode 100644 index 0000000000..b43d7fa4f9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +// Use macros to make sure argument lane is passed in as a constant integer. + +#define vmull_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlal_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlsl_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +static INLINE int32x4x2_t +highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS)); + return out; +} + +#define highbd_iadst_half_butterfly(in, c, lane, out) \ + do { \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ + } while (0) + +#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ + do { \ + vmull_lane_s32_dual(in0, c, lane0, s0); \ + vmull_lane_s32_dual(in0, c, lane1, s1); \ + vmlal_lane_s32_dual(in1, c, lane1, s0); \ + vmlsl_lane_s32_dual(in1, c, lane0, s1); \ + } while (0) + +static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vaddq_s32(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vaddq_s64(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vsubq_s32(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vsubq_s64(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0, + const int32x2x2_t in1) { + int32x4x2_t out; + out.val[0] = vcombine_s32(in0.val[0], in1.val[0]); + out.val[1] = vcombine_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { + int32x4x2_t out; + out.val[0] = vnegq_s32(in.val[0]); + out.val[1] = vnegq_s32(in.val[1]); + return out; +} + +static void highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t c_1_31_5_27 = + create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int32x4_t c_9_23_13_19 = + create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int32x4_t c_17_15_21_11 = + create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int32x4_t c_25_7_29_3 = + create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int32x4_t c_4_28_20_12 = + create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int32x4_t c_16_n16_8_24 = + create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int32x4x2_t in[16], out[16]; + int32x4x2_t x[16], t[12]; + int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1); + highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3); + highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5); + highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7); + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14, + s15); + + x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8); + x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9); + x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10); + x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13); + x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14); + x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15); + x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8); + x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9); + x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10); + x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11); + x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13, + s12); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[4]); + x[1] = vaddq_s32_dual(t[1], t[5]); + x[2] = vaddq_s32_dual(t[2], t[6]); + x[3] = vaddq_s32_dual(t[3], t[7]); + x[4] = vsubq_s32_dual(t[0], t[4]); + x[5] = vsubq_s32_dual(t[1], t[5]); + x[6] = vsubq_s32_dual(t[2], t[6]); + x[7] = vsubq_s32_dual(t[3], t[7]); + x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12); + x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13); + x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14); + x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15); + x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4, + s5); + highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7, + s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[2]); + x[1] = vaddq_s32_dual(t[1], t[3]); + x[2] = vsubq_s32_dual(t[0], t[2]); + x[3] = vsubq_s32_dual(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7); + x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6); + x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s32_dual(t[8], t[10]); + x[9] = vaddq_s32_dual(t[9], t[11]); + x[10] = vsubq_s32_dual(t[8], t[10]); + x[11] = vsubq_s32_dual(t[9], t[11]); + x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14); + x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15); + x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + { + const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]); + const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]); + const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]); + const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]); + const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]); + } + + out[0] = x[0]; + out[1] = vnegq_s32_dual(x[8]); + out[2] = x[12]; + out[3] = vnegq_s32_dual(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s32_dual(x[13]); + out[14] = x[9]; + out[15] = vnegq_s32_dual(x[1]); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd); + +typedef struct { + highbd_iht_1d cols, rows; // vertical and horizontal +} highbd_iht_2d; + +void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + if (bd == 8) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 1); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 1); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1); // right 8 columns + } else { + static const highbd_iht_2d IHT_16[] = { + { vpx_highbd_idct16x16_256_add_half1d, + vpx_highbd_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { highbd_iadst16_neon, + vpx_highbd_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_highbd_idct16x16_256_add_half1d, + highbd_iadst16_neon }, // DCT_ADST = 2 + { highbd_iadst16_neon, highbd_iadst16_neon } // ADST_ADST = 3 + }; + const highbd_iht_2d ht = IHT_16[tx_type]; + int32_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, bd); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, bd); // left 8 columns + ht.cols(row_output + 8 * 16, NULL, dest + 8, stride, + bd); // right 8 columns + } +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 0000000000..52c4f1937d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void highbd_iadst4(int32x4_t *const io) { + const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; + const int32x4_t sinpi = vld1q_s32(sinpis); + int64x2x2_t s[7], t[4]; + int32x4_t s7; + + s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0); + s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0); + s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1); + s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1); + s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0); + s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1); + s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1); + s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0); + s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0); + s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1); + s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1); + s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1); + s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1); + s7 = vsubq_s32(io[0], io[2]); + s7 = vaddq_s32(s7, io[3]); + + s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); + s[3] = s[2]; + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c new file mode 100644 index 0000000000..2232c6841c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x, + const int32x2_t c) { + const int32x4_t sum = vaddq_s32(x[0], x[1]); + const int32x4_t sub = vsubq_s32(x[0], x[1]); + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0); + const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS); + const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS); + const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS); + const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS); + + x[0] = vcombine_s32(out0_lo, out0_hi); + x[1] = vcombine_s32(out1_lo, out1_hi); +} + +static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0); +} + +static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1); +} + +static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]); + const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]); + const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1, + int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, + int32x4_t *const io6, int32x4_t *const io7) { + const int32x4_t c0 = + create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int32x4_t c1 = + create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int32x4_t c2 = + create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int32x4_t x[8], t[4]; + int64x2_t s[8][2]; + + x[0] = *io7; + x[1] = *io0; + x[2] = *io5; + x[3] = *io2; + x[4] = *io3; + x[5] = *io4; + x[6] = *io1; + x[7] = *io6; + + // stage 1 + highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0], + s[1]); + highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2], + s[3]); + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4], + s[5]); + highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6], + s[7]); + + x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]); + x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]); + x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]); + x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]); + x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]); + x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4], + s[5]); + highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7], + s[6]); + + x[0] = vaddq_s32(t[0], t[2]); + x[1] = vaddq_s32(t[1], t[3]); + x[2] = vsubq_s32(t[0], t[2]); + x[3] = vsubq_s32(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]); + x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]); + + // stage 3 + highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2)); + highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2)); + + *io0 = x[0]; + *io1 = vnegq_s32(x[4]); + *io2 = x[6]; + *io3 = vnegq_s32(x[2]); + *io4 = x[3]; + *io5 = vnegq_s32(x[7]); + *io6 = x[5]; + *io7 = vnegq_s32(x[1]); +} + +void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + switch (tx_type) { + case DCT_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + case ADST_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + + case DCT_ADST: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + default: { + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + } + + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + + case DCT_ADST: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + } + + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c new file mode 100644 index 0000000000..db72ff1161 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + int16x8_t in[16], out[16]; + const int16x4_t c_1_31_5_27 = + create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int16x4_t c_9_23_13_19 = + create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int16x4_t c_17_15_21_11 = + create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int16x4_t c_25_7_29_3 = + create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int16x4_t c_4_28_20_12 = + create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int16x4_t c_16_n16_8_24 = + create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int16x8_t x[16], t[12]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7); + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11); + iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13); + iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15); + + x[0] = add_dct_const_round_shift_low_8(s0, s8); + x[1] = add_dct_const_round_shift_low_8(s1, s9); + x[2] = add_dct_const_round_shift_low_8(s2, s10); + x[3] = add_dct_const_round_shift_low_8(s3, s11); + x[4] = add_dct_const_round_shift_low_8(s4, s12); + x[5] = add_dct_const_round_shift_low_8(s5, s13); + x[6] = add_dct_const_round_shift_low_8(s6, s14); + x[7] = add_dct_const_round_shift_low_8(s7, s15); + x[8] = sub_dct_const_round_shift_low_8(s0, s8); + x[9] = sub_dct_const_round_shift_low_8(s1, s9); + x[10] = sub_dct_const_round_shift_low_8(s2, s10); + x[11] = sub_dct_const_round_shift_low_8(s3, s11); + x[12] = sub_dct_const_round_shift_low_8(s4, s12); + x[13] = sub_dct_const_round_shift_low_8(s5, s13); + x[14] = sub_dct_const_round_shift_low_8(s6, s14); + x[15] = sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11); + iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14); + + x[0] = vaddq_s16(t[0], t[4]); + x[1] = vaddq_s16(t[1], t[5]); + x[2] = vaddq_s16(t[2], t[6]); + x[3] = vaddq_s16(t[3], t[7]); + x[4] = vsubq_s16(t[0], t[4]); + x[5] = vsubq_s16(t[1], t[5]); + x[6] = vsubq_s16(t[2], t[6]); + x[7] = vsubq_s16(t[3], t[7]); + x[8] = add_dct_const_round_shift_low_8(s8, s12); + x[9] = add_dct_const_round_shift_low_8(s9, s13); + x[10] = add_dct_const_round_shift_low_8(s10, s14); + x[11] = add_dct_const_round_shift_low_8(s11, s15); + x[12] = sub_dct_const_round_shift_low_8(s8, s12); + x[13] = sub_dct_const_round_shift_low_8(s9, s13); + x[14] = sub_dct_const_round_shift_low_8(s10, s14); + x[15] = sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s16(t[8], t[10]); + x[9] = vaddq_s16(t[9], t[11]); + x[10] = vsubq_s16(t[8], t[10]); + x[11] = vsubq_s16(t[9], t[11]); + x[12] = add_dct_const_round_shift_low_8(s12, s14); + x[13] = add_dct_const_round_shift_low_8(s13, s15); + x[14] = sub_dct_const_round_shift_low_8(s12, s14); + x[15] = sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24); + iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24); + + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 0); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 0); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0); // right 8 columns +} diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index dd1ea03b6b..4f0a90f215 100644 --- a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,213 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; - -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - d26u32 = d27u32 = vdup_n_u32(0); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c3..46ee632e01 100644 --- a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,55 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); -} - -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); - - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); - - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); - - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q5s16 = vdupq_n_s16(0); - - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); -} - void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 0000000000..c64822e27c --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} + +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} + +static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); +} + +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} + +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = vnegq_s16(x[1]); +} + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef void (*iht_1d)(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef struct { + iht_1d cols, rows; // vertical and horizontal +} iht_2d; + +#endif // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c index 3e3530116d..c031322806 100644 --- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c index 786fbdb794..aaccd5ca7b 100644 --- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c index e4166775da..76d15ff8c0 100644 --- a/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c +++ b/media/libvpx/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 0000000000..e861596ad4 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9e..9e73e40ea0 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c +++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.c @@ -17,35 +17,26 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif +void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width, + int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + *mi_cols = aligned_width >> MI_SIZE_LOG2; + *mi_rows = aligned_height >> MI_SIZE_LOG2; + *mi_stride = calc_mi_size(*mi_cols); } -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif +void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows, + int mi_cols) { + *mb_cols = (mi_cols + 1) >> 1; + *mb_rows = (mi_rows + 1) >> 1; + *mb_num = (*mb_rows) * (*mb_cols); } void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); - - cm->mi_cols = aligned_width >> MI_SIZE_LOG2; - cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mi_stride = calc_mi_size(cm->mi_cols); - - cm->mb_cols = (cm->mi_cols + 1) >> 1; - cm->mb_rows = (cm->mi_rows + 1) >> 1; - cm->MBs = cm->mb_rows * cm->mb_cols; + vp9_set_mi_size(&cm->mi_rows, &cm->mi_cols, &cm->mi_stride, width, height); + vp9_set_mb_size(&cm->mb_rows, &cm->mb_cols, &cm->MBs, cm->mi_rows, + cm->mi_cols); } static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { @@ -62,8 +53,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -75,22 +65,23 @@ static void free_seg_map(VP9_COMMON *cm) { vpx_free(cm->seg_map_array[i]); cm->seg_map_array[i] = NULL; } + cm->seg_map_alloc_size = 0; cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; + if (!pool) return; + for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -112,12 +103,13 @@ void vp9_free_postproc_buffers(VP9_COMMON *cm) { } void vp9_free_context_buffers(VP9_COMMON *cm) { - cm->free_mi(cm); + if (cm->free_mi) cm->free_mi(cm); free_seg_map(cm); vpx_free(cm->above_context); cm->above_context = NULL; vpx_free(cm->above_seg_context); cm->above_seg_context = NULL; + cm->above_context_alloc_cols = 0; vpx_free(cm->lf.lfm); cm->lf.lfm = NULL; } @@ -143,13 +135,6 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->free_mi(cm); if (cm->alloc_mi(cm, new_mi_size)) goto fail; } - - if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { - // Create the segmentation map structure and set to 0. - free_seg_map(cm); - if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; - } - if (cm->above_context_alloc_cols < cm->mi_cols) { vpx_free(cm->above_context); cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( @@ -164,6 +149,12 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { cm->above_context_alloc_cols = cm->mi_cols; } + if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) { + // Create the segmentation map structure and set to 0. + free_seg_map(cm); + if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols)) goto fail; + } + if (vp9_alloc_loop_filter(cm)) goto fail; return 0; @@ -176,6 +167,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +180,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h index a3a1638572..90cbb093d7 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h +++ b/media/libvpx/libvpx/vp9/common/vp9_alloccommon.h @@ -8,10 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define VP9_COMMON_VP9_ALLOCCOMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define INVALID_IDX -1 // Invalid buffer index. +#define INVALID_IDX (-1) // Invalid buffer index. #ifdef __cplusplus extern "C" { @@ -33,6 +33,11 @@ void vp9_free_postproc_buffers(struct VP9Common *cm); int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height); void vp9_free_state_buffers(struct VP9Common *cm); +void vp9_set_mi_size(int *mi_rows, int *mi_cols, int *mi_stride, int width, + int height); +void vp9_set_mb_size(int *mb_rows, int *mb_cols, int *mb_num, int mi_rows, + int mi_cols); + void vp9_set_mb_mi(struct VP9Common *cm, int width, int height); void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); @@ -41,4 +46,4 @@ void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ +#endif // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.c b/media/libvpx/libvpx/vp9/common/vp9_blockd.c index b0249687fd..4327599510 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_blockd.c +++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.c @@ -53,8 +53,9 @@ void vp9_foreach_transformed_block_in_plane( // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. const int max_blocks_wide = - num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> - (5 + pd->subsampling_x)); + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 diff --git a/media/libvpx/libvpx/vp9/common/vp9_blockd.h b/media/libvpx/libvpx/vp9/common/vp9_blockd.h index 780b29208b..514d6b7764 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_blockd.h +++ b/media/libvpx/libvpx/vp9/common/vp9_blockd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_BLOCKD_H_ -#define VP9_COMMON_VP9_BLOCKD_H_ +#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_ +#define VPX_VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" @@ -54,14 +54,22 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE -1 +#define NO_REF_FRAME (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 #define ALTREF_FRAME 3 #define MAX_REF_FRAMES 4 +#define MAX_INTER_REF_FRAMES 3 + typedef int8_t MV_REFERENCE_FRAME; +static INLINE int mv_ref_frame_to_inter_ref_idx( + MV_REFERENCE_FRAME mv_ref_frame) { + assert(mv_ref_frame >= LAST_FRAME && mv_ref_frame < MAX_REF_FRAMES); + return mv_ref_frame - 1; +} + // This structure now relates to 8x8 block regions. typedef struct MODE_INFO { // Common for both INTER and INTRA blocks @@ -130,9 +138,11 @@ struct macroblockd_plane { // encoder const int16_t *dequant; + + int *eob; }; -#define BLOCK_OFFSET(x, i) ((x) + (i)*16) +#define BLOCK_OFFSET(x, i) ((x) + (i) * 16) typedef struct RefBuffer { // TODO(dkovalev): idx is not really required and should be removed, now it @@ -173,7 +183,7 @@ typedef struct macroblockd { FRAME_CONTEXT *fc; /* pointers to reference frames */ - RefBuffer *block_refs[2]; + const RefBuffer *block_refs[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; @@ -193,6 +203,8 @@ typedef struct macroblockd { int corrupted; struct vpx_internal_error_info *error_info; + + PARTITION_TYPE *partition; } MACROBLOCKD; static INLINE PLANE_TYPE get_plane_type(int plane) { @@ -281,8 +293,30 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); +#if CONFIG_MISMATCH_DEBUG +#define TX_UNIT_SIZE_LOG2 2 +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col << MI_SIZE_LOG2) >> subsampling_x) + + (tx_blk_col << TX_UNIT_SIZE_LOG2); + *pixel_r = ((mi_row << MI_SIZE_LOG2) >> subsampling_y) + + (tx_blk_row << TX_UNIT_SIZE_LOG2); +} + +static INLINE int get_block_width(BLOCK_SIZE bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + return 4 * num_4x4_w; +} + +static INLINE int get_block_height(BLOCK_SIZE bsize) { + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + return 4 * num_4x4_h; +} +#endif + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_BLOCKD_H_ +#endif // VPX_VP9_COMMON_VP9_BLOCKD_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_common.h b/media/libvpx/libvpx/vp9/common/vp9_common.h index 666c3beaf0..d63bad93d1 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_H_ -#define VP9_COMMON_VP9_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_H_ +#define VPX_VP9_COMMON_VP9_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -27,44 +27,25 @@ extern "C" { // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) \ - { \ + do { \ assert(sizeof(dest) == sizeof(src)); \ memcpy(dest, src, sizeof(src)); \ - } + } while (0) // Use this for variably-sized arrays. -#define vp9_copy_array(dest, src, n) \ - { \ - assert(sizeof(*dest) == sizeof(*src)); \ - memcpy(dest, src, n * sizeof(*src)); \ +#define vp9_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, (n) * sizeof(*(src))); \ } #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest)) -#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest)) +#define vp9_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest))) static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? get_msb(num_values) + 1 : 0; } -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - lval = (expr); \ - if (!lval) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #define VP9_SYNC_CODE_0 0x49 #define VP9_SYNC_CODE_1 0x83 #define VP9_SYNC_CODE_2 0x42 @@ -75,4 +56,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.c b/media/libvpx/libvpx/vp9/common/vp9_common_data.c index 4a10833229..809d7317ce 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_common_data.c +++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.c @@ -28,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2, const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; -// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) +// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize))) const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; diff --git a/media/libvpx/libvpx/vp9/common/vp9_common_data.h b/media/libvpx/libvpx/vp9/common/vp9_common_data.h index 5c6a7e8ff3..a533c5f058 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_common_data.h +++ b/media/libvpx/libvpx/vp9/common/vp9_common_data.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ -#define VP9_COMMON_VP9_COMMON_DATA_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_ +#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_ #include "vp9/common/vp9_enums.h" #include "vpx/vpx_integer.h" @@ -42,4 +42,4 @@ extern const uint8_t need_top_left[INTRA_MODES]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_DATA_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c index 7d128c9f7f..28cd4a1924 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c +++ b/media/libvpx/libvpx/vp9/common/vp9_debugmodes.c @@ -34,7 +34,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(file, "%2d ", *((int *)((char *)(mi[0]) + member_offset))); + fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset))); mi++; } fprintf(file, "\n"); diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.c b/media/libvpx/libvpx/vp9/common/vp9_entropy.c index a575bda729..430b917b8f 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropy.c +++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropy.h b/media/libvpx/libvpx/vp9/common/vp9_entropy.h index 1da4911668..d026651df7 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropy.h +++ b/media/libvpx/libvpx/vp9/common/vp9_entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPY_H_ -#define VP9_COMMON_VP9_ENTROPY_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_ +#define VPX_VP9_COMMON_VP9_ENTROPY_H_ #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 @@ -195,4 +194,4 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPY_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPY_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.c b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c index 22365efc50..9289fc9e1f 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropymode.c +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.c @@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] { 93, 24, 99 }, // a split, l not split { 85, 119, 44 }, // l split, a not split { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 { 149, 53, 53 }, // a/l both not split { 94, 20, 48 }, // a split, l not split { 83, 53, 24 }, // l split, a not split { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 { 150, 40, 39 }, // a/l both not split { 78, 12, 26 }, // a split, l not split { 67, 33, 11 }, // l split, a not split { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 { 174, 35, 49 }, // a/l both not split { 68, 11, 27 }, // a split, l not split @@ -260,13 +263,13 @@ const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { - 9, 102, 187, 225 -}; +static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, + 187, + 225 }; -static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { - 239, 183, 119, 96, 41 -}; +static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { 239, 183, + 119, 96, + 41 }; static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221, 226 }; @@ -331,8 +334,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) { vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); } -const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] = - { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; +const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE( + SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; @@ -378,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } if (cm->tx_mode == TX_MODE_SELECT) { - int j; unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; @@ -428,7 +430,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +459,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. - if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymode.h b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h index 0ee663fe88..e616aeac5c 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropymode.h +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ -#define VP9_COMMON_VP9_ENTROPYMODE_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -24,7 +24,7 @@ extern "C" { #define TX_SIZE_CONTEXTS 2 -#define INTER_OFFSET(mode) ((mode)-NEARESTMV) +#define INTER_OFFSET(mode) ((mode) - NEARESTMV) struct VP9Common; @@ -104,4 +104,4 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.c b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c index a18a290cfd..b6f052d088 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropymv.c +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/media/libvpx/libvpx/vp9/common/vp9_entropymv.h b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h index e2fe37a327..ee9d37973f 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_entropymv.h +++ b/media/libvpx/libvpx/vp9/common/vp9_entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ -#define VP9_COMMON_VP9_ENTROPYMV_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_ #include "./vpx_config.h" @@ -25,7 +25,7 @@ struct VP9Common; void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); static INLINE int use_mv_hp(const MV *ref) { const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv @@ -127,10 +127,10 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMV_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_enums.h b/media/libvpx/libvpx/vp9/common/vp9_enums.h index 056b298b3d..b33a3a2978 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_enums.h +++ b/media/libvpx/libvpx/vp9/common/vp9_enums.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENUMS_H_ -#define VP9_COMMON_VP9_ENUMS_H_ +#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_ +#define VPX_VP9_COMMON_VP9_ENUMS_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; +typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG; + #define BLOCK_4X4 0 #define BLOCK_4X8 1 #define BLOCK_8X4 2 @@ -140,4 +142,4 @@ typedef uint8_t PREDICTION_MODE; } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENUMS_H_ +#endif // VPX_VP9_COMMON_VP9_ENUMS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.c b/media/libvpx/libvpx/vp9/common/vp9_filter.c index 6c43af8ce8..adbda6c825 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_filter.c +++ b/media/libvpx/libvpx/vp9/common/vp9_filter.c @@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } }; -const InterpKernel *vp9_filter_kernels[4] = { - sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 114, 28, -6, 0, 0 }, + { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 }, + { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 }, + { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 }; diff --git a/media/libvpx/libvpx/vp9/common/vp9_filter.h b/media/libvpx/libvpx/vp9/common/vp9_filter.h index 9d2b8e1dbf..0382c88e7c 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_filter.h +++ b/media/libvpx/libvpx/vp9/common/vp9_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_FILTER_H_ -#define VP9_COMMON_VP9_FILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_FILTER_H_ +#define VPX_VP9_COMMON_VP9_FILTER_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,6 +25,7 @@ extern "C" { #define EIGHTTAP_SHARP 2 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ #define BILINEAR 3 +#define FOURTAP 4 // The codec can operate in four possible inter prediction filter mode: // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) @@ -32,10 +33,10 @@ extern "C" { typedef uint8_t INTERP_FILTER; -extern const InterpKernel *vp9_filter_kernels[4]; +extern const InterpKernel *vp9_filter_kernels[5]; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_FILTER_H_ +#endif // VPX_VP9_COMMON_VP9_FILTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c index a254e79d20..889b809e50 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c +++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.c @@ -14,14 +14,17 @@ #include "vpx_mem/vpx_mem.h" int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; assert(list != NULL); vp9_free_internal_frame_buffers(list); - list->num_internal_frame_buffers = - VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; - list->int_fb = (InternalFrameBuffer *)vpx_calloc( - list->num_internal_frame_buffers, sizeof(*list->int_fb)); - return (list->int_fb == NULL); + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb)); + if (list->int_fb) { + list->num_internal_frame_buffers = num_buffers; + return 0; + } + return -1; } void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { @@ -35,6 +38,7 @@ void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { } vpx_free(list->int_fb); list->int_fb = NULL; + list->num_internal_frame_buffers = 0; } int vp9_get_frame_buffer(void *cb_priv, size_t min_size, diff --git a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h index e2cfe61b66..11be838c02 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h +++ b/media/libvpx/libvpx/vp9/common/vp9_frame_buffers.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ -#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ #include "vpx/vpx_frame_buffer.h" #include "vpx/vpx_integer.h" @@ -50,4 +50,4 @@ int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#endif // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.c b/media/libvpx/libvpx/vp9/common/vp9_idct.c index e3a088e287..71be0f310d 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_idct.c +++ b/media/libvpx/libvpx/vp9/common/vp9_idct.c @@ -150,18 +150,22 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob == 1) /* DC only DCT coefficient. */ vpx_idct16x16_1_add(input, dest, stride); else if (eob <= 10) vpx_idct16x16_10_add(input, dest, stride); + else if (eob <= 38) + vpx_idct16x16_38_add(input, dest, stride); else vpx_idct16x16_256_add(input, dest, stride); } void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); if (eob == 1) vpx_idct32x32_1_add(input, dest, stride); else if (eob <= 34) @@ -203,7 +207,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { const highbd_transform_2d IHT_4[] = { { vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0 @@ -211,7 +215,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, { vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2 { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3 }; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); int i, j; tran_low_t out[4 * 4]; @@ -243,14 +246,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = { { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3 }; -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; const highbd_transform_2d ht = HIGH_IHT_8[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Inverse transform row vectors. for (i = 0; i < 8; ++i) { @@ -277,14 +279,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = { { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3 }; -void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; const highbd_transform_2d ht = HIGH_IHT_16[tx_type]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 16; ++i) { @@ -305,7 +306,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } // idct -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_idct4x4_16_add(input, dest, stride, bd); @@ -313,7 +314,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_idct4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { if (eob > 1) vpx_highbd_iwht4x4_16_add(input, dest, stride, bd); @@ -321,7 +322,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, vpx_highbd_iwht4x4_1_add(input, dest, stride, bd); } -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -338,7 +339,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, } } -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // The calculation can be simplified if there are not many non-zero dct // coefficients. Use eobs to separate different cases. @@ -347,18 +348,22 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, vpx_highbd_idct16x16_1_add(input, dest, stride, bd); } else if (eob <= 10) { vpx_highbd_idct16x16_10_add(input, dest, stride, bd); + } else if (eob <= 38) { + vpx_highbd_idct16x16_38_add(input, dest, stride, bd); } else { vpx_highbd_idct16x16_256_add(input, dest, stride, bd); } } -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd) { // Non-zero coeff only in upper-left 8x8 if (eob == 1) { vpx_highbd_idct32x32_1_add(input, dest, stride, bd); } else if (eob <= 34) { vpx_highbd_idct32x32_34_add(input, dest, stride, bd); + } else if (eob <= 135) { + vpx_highbd_idct32x32_135_add(input, dest, stride, bd); } else { vpx_highbd_idct32x32_1024_add(input, dest, stride, bd); } @@ -366,7 +371,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, // iht void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) vp9_highbd_idct4x4_add(input, dest, stride, eob, bd); else @@ -374,7 +379,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct8x8_add(input, dest, stride, eob, bd); } else { @@ -383,7 +388,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, } void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd) { + uint16_t *dest, int stride, int eob, int bd) { if (tx_type == DCT_DCT) { vp9_highbd_idct16x16_add(input, dest, stride, eob, bd); } else { diff --git a/media/libvpx/libvpx/vp9/common/vp9_idct.h b/media/libvpx/libvpx/vp9/common/vp9_idct.h index ea958a38c0..94eeaf599e 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_idct.h +++ b/media/libvpx/libvpx/vp9/common/vp9_idct.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_IDCT_H_ -#define VP9_COMMON_VP9_IDCT_H_ +#ifndef VPX_VP9_COMMON_VP9_IDCT_H_ +#define VPX_VP9_COMMON_VP9_IDCT_H_ #include @@ -57,25 +57,25 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, +void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); -void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest, +void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, - uint8_t *dest, int stride, int eob, int bd); + uint16_t *dest, int stride, int eob, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_IDCT_H_ +#endif // VPX_VP9_COMMON_VP9_IDCT_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c index ef0297dd5e..1a9d45ae77 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c +++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.c @@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, + MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. It helps us to avoid @@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; default: for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y = shift_32_y[idx_32]; - const int shift_uv = shift_32_uv[idx_32]; + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; const int mi_32_col_offset = ((idx_32 & 1) << 2); const int mi_32_row_offset = ((idx_32 >> 1) << 2); if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) continue; switch (mip[0]->sb_type) { case BLOCK_32X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); break; case BLOCK_32X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_row_offset + 2 >= max_rows) continue; mip2 = mip + mode_info_stride * 2; - build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); break; case BLOCK_16X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_col_offset + 2 >= max_cols) continue; mip2 = mip + 2; - build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); break; default: for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { - const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; - const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int shift_y_16 = shift_y_32 + shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; const int mi_16_col_offset = mi_32_col_offset + ((idx_16 & 1) << 1); const int mi_16_row_offset = @@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, switch (mip[0]->sb_type) { case BLOCK_16X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); break; case BLOCK_16X8: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_row_offset + 1 >= max_rows) continue; mip2 = mip + mode_info_stride; - build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); break; case BLOCK_8X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_col_offset + 1 >= max_cols) continue; mip2 = mip + 1; - build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); break; default: { - const int shift_y = - shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0]; - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); mip += offset[0]; for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y = shift_32_y[idx_32] + - shift_16_y[idx_16] + shift_8_y[idx_8]; + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; const int mi_8_col_offset = mi_16_col_offset + ((idx_8 & 1)); const int mi_8_row_offset = @@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, if (mi_8_col_offset >= max_cols || mi_8_row_offset >= max_rows) continue; - build_y_mask(lfi_n, mip[0], shift_y, lfm); + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); } break; } @@ -1087,13 +1085,19 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; - unsigned int mask_16x16[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_8x8[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = { 0 }; + unsigned int mask_16x16[MI_BLOCK_SIZE]; + unsigned int mask_8x8[MI_BLOCK_SIZE]; + unsigned int mask_4x4[MI_BLOCK_SIZE]; + unsigned int mask_4x4_int[MI_BLOCK_SIZE]; uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; + vp9_zero(mask_16x16); + vp9_zero(mask_8x8); + vp9_zero(mask_4x4); + vp9_zero(mask_4x4_int); + vp9_zero(lfl); + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; @@ -1174,7 +1178,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ? 1u : 0u); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( @@ -1330,6 +1334,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; + vp9_zero(lfl_uv); + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); // Vertical pass: do 2 rows at one time @@ -1612,12 +1618,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebde..39648a72c3 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h +++ b/media/libvpx/libvpx/vp9/common/vp9_loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ -#define VP9_COMMON_VP9_LOOPFILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_ +#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "./vpx_config.h" @@ -97,7 +97,7 @@ struct VP9LfSyncData; // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, + const int mi_col, MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); void vp9_filter_block_plane_ss00(struct VP9Common *const cm, @@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - struct macroblockd *mbd, int filter_level, + struct macroblockd *xd, int frame_filter_level, int y_only, int partial_frame); // Get the superblock lfm for a given mi_row, mi_col. @@ -151,10 +151,10 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_LOOPFILTER_H_ +#endif // VPX_VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.c b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c index e76d771b8d..cf60fa40fd 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_mfqe.c +++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.c @@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; if (cur_bs < BLOCK_8X8) { // If there are blocks smaller than 8x8, it must be on the boundary. @@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_offset = 8; } switch (partition) { - BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: if (bs == BLOCK_64X64) { mfqe_bs = BLOCK_64X32; diff --git a/media/libvpx/libvpx/vp9/common/vp9_mfqe.h b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h index dfff8c23d6..f53e1c2f9d 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_mfqe.h +++ b/media/libvpx/libvpx/vp9/common/vp9_mfqe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MFQE_H_ -#define VP9_COMMON_VP9_MFQE_H_ +#ifndef VPX_VP9_COMMON_VP9_MFQE_H_ +#define VPX_VP9_COMMON_VP9_MFQE_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ void vp9_mfqe(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_MFQE_H_ +#endif // VPX_VP9_COMMON_VP9_MFQE_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mv.h b/media/libvpx/libvpx/vp9/common/vp9_mv.h index 4c8eac7213..76f93cf0ba 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_mv.h +++ b/media/libvpx/libvpx/vp9/common/vp9_mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MV_H_ -#define VP9_COMMON_VP9_MV_H_ +#ifndef VPX_VP9_COMMON_VP9_MV_H_ +#define VPX_VP9_COMMON_VP9_MV_H_ #include "vpx/vpx_integer.h" @@ -19,6 +19,8 @@ extern "C" { #endif +#define INVALID_MV 0x80008000 + typedef struct mv { int16_t row; int16_t col; @@ -52,4 +54,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MV_H_ +#endif // VPX_VP9_COMMON_VP9_MV_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h index 2b2c1ba9ee..5db6772dca 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_mvref_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" @@ -263,10 +263,10 @@ static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref, mv_ref_list, Done) \ do { \ if (is_inter_block(mbmi)) { \ - if ((mbmi)->ref_frame[0] != ref_frame) \ + if ((mbmi)->ref_frame[0] != (ref_frame)) \ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ - if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame && \ + if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != (ref_frame) && \ (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ @@ -320,4 +320,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa7..4c8fcf6989 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h +++ b/media/libvpx/libvpx/vp9/common/vp9_onyxc_int.h @@ -8,12 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ -#define VP9_COMMON_VP9_ONYXC_INT_H_ +#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_ +#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#include "vpx_util/vpx_thread.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" @@ -37,13 +36,9 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) +// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on +// the encoder. +#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +67,22 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; + + // Note that frame_index/frame_coding_index are only set by set_frame_index() + // on the encoder side. + + // TODO(angiebird): Set frame_index/frame_coding_index on the decoder side + // properly. + int frame_index; // Display order in the video, it's equivalent to the + // show_idx defined in EncodeFrameInfo. + int frame_coding_index; // The coding order (starting from zero) of this + // frame. vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. -#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -149,6 +136,8 @@ typedef struct VP9Common { int new_fb_idx; + int cur_show_frame_fb_idx; + #if CONFIG_VP9_POSTPROC YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer_int; @@ -235,10 +224,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -249,7 +234,14 @@ typedef struct VP9Common { unsigned int frame_context_idx; /* Context to use/update */ FRAME_COUNTS counts; + // TODO(angiebird): current_video_frame/current_frame_coding_index into a + // structure unsigned int current_video_frame; + // Each show or no show frame is assigned with a coding index based on its + // coding order (starting from zero). + + // Current frame's coding index. + int current_frame_coding_index; BITSTREAM_PROFILE profile; // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. @@ -267,26 +259,63 @@ typedef struct VP9Common { int byte_alignment; int skip_loop_filter; - // Private data associated with the frame buffer callbacks. - void *cb_priv; - vpx_get_frame_buffer_cb_fn_t get_fb_cb; - vpx_release_frame_buffer_cb_fn_t release_fb_cb; - - // Handles memory for the codec. - InternalFrameBufferList int_frame_buffers; - // External BufferPool passed from outside. BufferPool *buffer_pool; PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); +static INLINE void init_frame_indexes(VP9_COMMON *cm) { + cm->current_video_frame = 0; + cm->current_frame_coding_index = 0; +} + +static INLINE void update_frame_indexes(VP9_COMMON *cm, int show_frame) { + if (show_frame) { + // Don't increment frame counters if this was an altref buffer + // update not a real frame + ++cm->current_video_frame; + } + ++cm->current_frame_coding_index; +} + +typedef struct { + int frame_width; + int frame_height; + int render_frame_width; + int render_frame_height; + int mi_rows; + int mi_cols; + int mb_rows; + int mb_cols; + int num_mbs; + vpx_bit_depth_t bit_depth; +} FRAME_INFO; + +static INLINE void init_frame_info(FRAME_INFO *frame_info, + const VP9_COMMON *cm) { + frame_info->frame_width = cm->width; + frame_info->frame_height = cm->height; + frame_info->render_frame_width = cm->render_width; + frame_info->render_frame_height = cm->render_height; + frame_info->mi_cols = cm->mi_cols; + frame_info->mi_rows = cm->mi_rows; + frame_info->mb_cols = cm->mb_cols; + frame_info->mb_rows = cm->mb_rows; + frame_info->num_mbs = cm->MBs; + frame_info->bit_depth = cm->bit_depth; + // TODO(angiebird): Figure out how to get subsampling_x/y here +} + +static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= FRAME_BUFFERS) return NULL; + if (cm->error.error_code != VPX_CODEC_OK) return NULL; + return &cm->buffer_pool->frame_bufs[index].buf; +} static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; @@ -303,7 +332,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +342,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +369,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -437,4 +464,4 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ONYXC_INT_H_ +#endif // VPX_VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.c b/media/libvpx/libvpx/vp9/common/vp9_postproc.c index b105e5d45a..c777556050 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_postproc.c +++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.c @@ -119,8 +119,8 @@ void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, uint16_t d[16]; for (r = 0; r < rows; r++) { - int sumsq = 0; - int sum = 0; + int64_t sumsq = 0; + int64_t sum = 0; for (i = -8; i <= 6; i++) { sumsq += s[i] * s[i]; @@ -157,8 +157,8 @@ void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols, for (c = 0; c < cols; c++) { uint16_t *s = &dst[c]; - int sumsq = 0; - int sum = 0; + int64_t sumsq = 0; + int64_t sum = 0; uint16_t d[16]; const int16_t *rv2 = rv3 + ((c * 17) & 127); @@ -183,7 +183,8 @@ void vp9_highbd_mbpost_proc_down_c(uint16_t *dst, int pitch, int rows, int cols, } #endif // CONFIG_VP9_HIGHBITDEPTH -static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, +static void deblock_and_de_macro_block(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag, uint8_t *limits) { @@ -216,7 +217,7 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, source->uv_height, source->uv_width, ppl); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_deblock(source, post, q, limits); + vp9_deblock(cm, source, post, q, limits); vpx_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q)); vpx_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height, @@ -226,8 +227,8 @@ static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, #endif // CONFIG_VP9_HIGHBITDEPTH } -void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, - uint8_t *limits) { +void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) { const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + 0.0065 + 0.5); #if CONFIG_VP9_HIGHBITDEPTH @@ -252,10 +253,8 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, } else { #endif // CONFIG_VP9_HIGHBITDEPTH int mbr; - const int mb_rows = src->y_height / 16; - const int mb_cols = src->y_width / 16; - - memset(limits, (unsigned char)ppl, 16 * mb_cols); + const int mb_rows = cm->mb_rows; + memset(limits, (unsigned char)ppl, cm->postproc_state.limits_size); for (mbr = 0; mbr < mb_rows; mbr++) { vpx_post_proc_down_and_across_mb_row( @@ -276,9 +275,9 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, #endif // CONFIG_VP9_HIGHBITDEPTH } -void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, - uint8_t *limits) { - vp9_deblock(src, dst, q, limits); +void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits) { + vp9_deblock(cm, src, dst, q, limits); } static void swap_mi_and_prev_mi(VP9_COMMON *cm) { @@ -293,11 +292,12 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) { } int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { + vp9_ppflags_t *ppflags, int unscaled_width) { const int q = VPXMIN(105, cm->lf.filter_level * 2); const int flags = ppflags->post_proc_flag; YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; struct postproc_state *const ppstate = &cm->postproc_state; + ppstate->limits_size = unscaled_width; if (!cm->frame_to_show) return -1; @@ -339,10 +339,10 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, "Failed to allocate MFQE framebuffer"); } - // Ensure that postproc is set to all 0s so that post proc + // Ensure that postproc is set to flat image so that post proc // doesn't pull random data in from edge. memset(cm->post_proc_buffer_int.buffer_alloc, 128, - cm->post_proc_buffer.frame_size); + cm->post_proc_buffer_int.frame_size); } } @@ -352,14 +352,18 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, cm->use_highbitdepth, #endif VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL) < 0) + NULL, NULL, NULL) < 0) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate post-processing buffer"); + } + memset(cm->post_proc_buffer.buffer_alloc, 128, + cm->post_proc_buffer.frame_size); if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { if (!cm->postproc_state.limits) { cm->postproc_state.limits = - vpx_calloc(cm->width, sizeof(*cm->postproc_state.limits)); + vpx_calloc(ppstate->limits_size, sizeof(*cm->postproc_state.limits)); + if (!cm->postproc_state.limits) return 1; } } @@ -380,26 +384,26 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { - deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, + deblock_and_de_macro_block(cm, &cm->post_proc_buffer_int, ppbuf, q + (ppflags->deblocking_level - 5) * 10, 1, 0, cm->postproc_state.limits); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, + vp9_deblock(cm, &cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { - deblock_and_de_macro_block(cm->frame_to_show, ppbuf, + deblock_and_de_macro_block(cm, cm->frame_to_show, ppbuf, q + (ppflags->deblocking_level - 5) * 10, 1, 0, cm->postproc_state.limits); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); + vp9_deblock(cm, cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/media/libvpx/libvpx/vp9/common/vp9_postproc.h b/media/libvpx/libvpx/vp9/common/vp9_postproc.h index 6059094114..ef6f4ea5f4 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_postproc.h +++ b/media/libvpx/libvpx/vp9/common/vp9_postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_POSTPROC_H_ -#define VP9_COMMON_VP9_POSTPROC_H_ +#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_ +#define VPX_VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" @@ -30,6 +30,7 @@ struct postproc_state { MODE_INFO *prev_mi; int clamp; uint8_t *limits; + int limits_size; int8_t *generated_noise; }; @@ -38,16 +39,16 @@ struct VP9Common; #define MFQE_PRECISION 4 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); + vp9_ppflags_t *ppflags, int unscaled_width); -void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, - uint8_t *limits); +void vp9_denoise(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); -void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, - uint8_t *limits); +void vp9_deblock(struct VP9Common *cm, const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_POSTPROC_H_ +#endif // VPX_VP9_COMMON_VP9_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_ppflags.h b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h index b8b647bf18..a0e3017626 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_ppflags.h +++ b/media/libvpx/libvpx/vp9/common/vp9_ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PPFLAGS_H_ -#define VP9_COMMON_VP9_PPFLAGS_H_ +#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_ +#define VPX_VP9_COMMON_VP9_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ typedef struct { } // extern "C" #endif -#endif // VP9_COMMON_VP9_PPFLAGS_H_ +#endif // VPX_VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.c b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c index a7ddc0b951..375cb4d76c 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_pred_common.c +++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.c @@ -13,6 +13,32 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" +int vp9_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; + + return 0; +} + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int ctx; @@ -229,9 +255,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/media/libvpx/libvpx/vp9/common/vp9_pred_common.h b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h index 8400bd70f1..ee59669359 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_pred_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_pred_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PRED_COMMON_H_ -#define VP9_COMMON_VP9_PRED_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_ +#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" @@ -145,6 +145,10 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } +int vp9_compound_reference_allowed(const VP9_COMMON *cm); + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm); + // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. @@ -176,12 +180,6 @@ static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size, - const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); -} - static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, struct tx_counts *tx_counts) { switch (max_tx_size) { @@ -196,4 +194,4 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_PRED_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_quant_common.h b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h index 4bae4a8967..ec8b9f4c6a 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_quant_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ -#define VP9_COMMON_VP9_QUANT_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ #include "vpx/vpx_codec.h" #include "vp9/common/vp9_seg_common.h" @@ -33,4 +33,4 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // VP9_COMMON_VP9_QUANT_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.c b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c index 8eb7126898..0a60b853d8 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_reconinter.c +++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.c @@ -13,15 +13,16 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_config.h" -#include "vpx/vpx_integer.h" - #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vpx/vpx_integer.h" +#include "vpx_scale/yv12config.h" + #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd) { @@ -63,14 +64,14 @@ static INLINE int round_mv_comp_q4(int value) { } static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { - MV res = { - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row + - mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row), - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col + - mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col) - }; + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; return res; } @@ -96,8 +97,8 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv, int bw, const int spel_right = spel_left - SUBPEL_SHIFTS; const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS; const int spel_bottom = spel_top - SUBPEL_SHIFTS; - MV clamped_mv = { src_mv->row * (1 << (1 - ss_y)), - src_mv->col * (1 << (1 - ss_x)) }; + MV clamped_mv = { (short)(src_mv->row * (1 << (1 - ss_y))), + (short)(src_mv->col * (1 << (1 - ss_x))) }; assert(ss_x <= 1); assert(ss_y <= 1); @@ -136,7 +137,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + uint8_t *const dst = dst_buf->buf + (int64_t)dst_buf->stride * y + x; const MV mv = mi->sb_type < BLOCK_8X8 ? average_split_mvs(pd, mi, ref, block) : mi->mv[ref].as_mv; @@ -158,18 +159,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; #if 0 // CONFIG_BETTER_HW_COMPATIBILITY assert(xd->mi[0]->sb_type != BLOCK_4X8 && xd->mi[0]->sb_type != BLOCK_8X4); assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); #endif - if (plane == 0) - pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; - else if (plane == 1) - pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; - else - pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); @@ -178,7 +180,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, xs = sf->x_step_q4; ys = sf->y_step_q4; } else { - pre = pre_buf->buf + (y * pre_buf->stride + x); + pre = pre_buf->buf + ((int64_t)y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; @@ -190,7 +192,8 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(pre), pre_buf->stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconinter.h b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h index 4fed4f7f6e..12b545831a 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_reconinter.h +++ b/media/libvpx/libvpx/vp9/common/vp9_reconinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_RECONINTER_H_ -#define VP9_COMMON_VP9_RECONINTER_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_ +#define VPX_VP9_COMMON_VP9_RECONINTER_H_ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" @@ -26,19 +26,19 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -61,24 +61,25 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *mv_q3, + int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, - int do_avg, const InterpKernel *kernel, + int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( - const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); #endif -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, @@ -103,4 +104,4 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTER_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTER_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_reconintra.h b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h index 78e41c8811..426a35ebfa 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_reconintra.h +++ b/media/libvpx/libvpx/vp9/common/vp9_reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_RECONINTRA_H_ -#define VP9_COMMON_VP9_RECONINTRA_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_ +#define VPX_VP9_COMMON_VP9_RECONINTRA_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -28,4 +28,4 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTRA_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c index d8c870aa3f..1a93b97e56 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd.c +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd.c @@ -12,8 +12,4 @@ #include "./vp9_rtcd.h" #include "vpx_ports/vpx_once.h" -void vp9_rtcd() { - // TODO(JBB): Remove this once, by insuring that both the encoder and - // decoder setup functions are protected by once(); - once(setup_rtcd_internal); -} +void vp9_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl index 088b004f52..dac3d89e2a 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/media/libvpx/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp9_common_forward_decls() { print <mi[0]; if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { @@ -55,4 +55,4 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCAN_H_ +#endif // VPX_VP9_COMMON_VP9_SCAN_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_seg_common.h b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h index b9bf75d580..5e71c2fca5 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_seg_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_seg_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ -#define VP9_COMMON_VP9_SEG_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_ +#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_ #include "vpx_dsp/prob.h" @@ -25,6 +25,11 @@ extern "C" { #define PREDICTION_PROBS 3 +// Segment ID used to skip background encoding +#define BACKGROUND_SEG_SKIP_ID 3 +// Number of frames that don't skip after a key frame +#define FRAMES_NO_SKIPPING_AFTER_KEY 20 + // Segment level features. typedef enum { SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... @@ -78,4 +83,4 @@ extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_SEG_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c index 07e659d235..7f5ac36669 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.c +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.c @@ -8,41 +8,27 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_loopfilter.h" -#if CONFIG_MULTITHREAD -static INLINE void mutex_lock(pthread_mutex_t *const mutex) { - const int kMaxTryLocks = 4000; - int locked = 0; - int i; - - for (i = 0; i < kMaxTryLocks; ++i) { - if (!pthread_mutex_trylock(mutex)) { - locked = 1; - break; - } - } - - if (!locked) pthread_mutex_lock(mutex); -} -#endif // CONFIG_MULTITHREAD - static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { #if CONFIG_MULTITHREAD const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; - mutex_lock(mutex); + pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1]; + pthread_mutex_lock(mutex); while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + pthread_cond_wait(&lf_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -69,12 +55,12 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } if (sig) { - mutex_lock(&lf_sync->mutex_[r]); + pthread_mutex_lock(&lf_sync->mutex[r]); lf_sync->cur_sb_col[r] = cur; - pthread_cond_signal(&lf_sync->cond_[r]); - pthread_mutex_unlock(&lf_sync->mutex_[r]); + pthread_cond_signal(&lf_sync->cond[r]); + pthread_mutex_unlock(&lf_sync->mutex[r]); } #else (void)lf_sync; @@ -91,6 +77,7 @@ static INLINE void thread_loop_filter_rows( int y_only, VP9LfSync *const lf_sync) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + const int num_active_workers = lf_sync->num_active_workers; int mi_row, mi_col; enum lf_path path; if (y_only) @@ -102,8 +89,10 @@ static INLINE void thread_loop_filter_rows( else path = LF_PATH_SLOW; + assert(num_active_workers > 0); + for (mi_row = start; mi_row < stop; - mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { + mi_row += num_active_workers * MI_BLOCK_SIZE) { MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); @@ -140,8 +129,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(VP9LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + VP9LfSync *const lf_sync = (VP9LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -156,10 +146,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; - // Decoder may allocate more threads than number of tiles based on user's - // input. - const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = VPXMIN(nworkers, tile_cols); + const int num_tile_cols = 1 << cm->log2_tile_cols; + // Limit the number of workers to prevent changes in frame dimensions from + // causing incorrect sync calculations when sb_rows < threads/tile_cols. + // Further restrict them by the number of tile columns should the user + // request more as this implementation doesn't scale well beyond that. + const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows)); int i; if (!lf_sync->sync_range || sb_rows != lf_sync->rows || @@ -167,6 +159,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } + lf_sync->num_active_workers = num_workers; // Initialize cur_sb_col to -1 for all SB rows. memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -183,7 +176,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; @@ -230,6 +223,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k @@ -252,60 +267,205 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - if (lf_sync->mutex_) { + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, + vpx_malloc(sizeof(*lf_sync->mutex) * rows)); + if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + pthread_mutex_init(&lf_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - if (lf_sync->cond_) { + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, + vpx_malloc(sizeof(*lf_sync->cond) * rows)); + if (lf_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + pthread_cond_init(&lf_sync->cond[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, + vpx_malloc(sizeof(*lf_sync->lf_mutex))); + pthread_mutex_init(lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } } } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, lf_sync->lfdata, + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; + lf_sync->num_active_workers = lf_sync->num_workers; - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. lf_sync->sync_range = get_sync_range(width); } // Deallocate lf synchronization related mutex and data void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { - if (lf_sync != NULL) { -#if CONFIG_MULTITHREAD - int i; + assert(lf_sync != NULL); - if (lf_sync->mutex_ != NULL) { - for (i = 0; i < lf_sync->rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); - } - vpx_free(lf_sync->mutex_); +#if CONFIG_MULTITHREAD + if (lf_sync->mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex[i]); } - if (lf_sync->cond_ != NULL) { - for (i = 0; i < lf_sync->rows; ++i) { - pthread_cond_destroy(&lf_sync->cond_[i]); - } - vpx_free(lf_sync->cond_); - } -#endif // CONFIG_MULTITHREAD - vpx_free(lf_sync->lfdata); - vpx_free(lf_sync->cur_sb_col); - // clear the structure as the source of this call may be a resize in which - // case this call will be followed by an _alloc() which may fail. - vp9_zero(*lf_sync); + vpx_free(lf_sync->mutex); } + if (lf_sync->cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond[i]); + } + vpx_free(lf_sync->cond); + } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + if (lf_sync->lf_mutex != NULL) { + pthread_mutex_destroy(lf_sync->lf_mutex); + vpx_free(lf_sync->lf_mutex); + } + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); + } +#endif // CONFIG_MULTITHREAD + + vpx_free(lf_sync->lfdata); + vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vp9_zero(*lf_sync); +} + +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + int cur_row; + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(lf_sync->lf_mutex); + if (lf_sync->corrupted) { + int row = return_val >> MI_BLOCK_SIZE_LOG2; + pthread_mutex_lock(&lf_sync->mutex[row]); + lf_sync->cur_sb_col[row] = INT_MAX; + pthread_cond_signal(&lf_sync->cond[row]); + pthread_mutex_unlock(&lf_sync->mutex[row]); + return_val = -1; + } + pthread_mutex_unlock(lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); } // Accumulate frame counts. diff --git a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h index 0f7c3ff748..96c705d0d5 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_thread_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_thread_common.h @@ -8,10 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_ -#define VP9_COMMON_VP9_THREAD_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #ifdef __cplusplus @@ -24,8 +25,8 @@ struct FRAME_COUNTS; // Loopfilter row synchronization typedef struct VP9LfSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the loop-filtered superblock index in each row. int *cur_sb_col; @@ -36,7 +37,16 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; - int num_workers; + int num_workers; // number of allocated workers. + int num_active_workers; // number of scheduled workers. + +#if CONFIG_MULTITHREAD + pthread_mutex_t *lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. @@ -53,6 +63,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); @@ -60,4 +81,4 @@ void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, } // extern "C" #endif -#endif // VP9_COMMON_VP9_THREAD_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/vp9_tile_common.h b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h index 1b11c2680d..4ccf0a3d5f 100644 --- a/media/libvpx/libvpx/vp9/common/vp9_tile_common.h +++ b/media/libvpx/libvpx/vp9/common/vp9_tile_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ -#define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_ +#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, } // extern "C" #endif -#endif // VP9_COMMON_VP9_TILE_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 0000000000..57b79a732d --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = _mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); + } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c new file mode 100644 index 0000000000..af158536f9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst4_sse4_1(__m128i *const io) { + const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0); + const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0); + const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0); + const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0); + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2]; + __m128i temp[2]; + + transpose_32bit_4x4(io, io); + + extend_64bit(io[0], temp); + s0[0] = _mm_mul_epi32(pair_c1, temp[0]); + s0[1] = _mm_mul_epi32(pair_c1, temp[1]); + s1[0] = _mm_mul_epi32(pair_c2, temp[0]); + s1[1] = _mm_mul_epi32(pair_c2, temp[1]); + + extend_64bit(io[1], temp); + s2[0] = _mm_mul_epi32(pair_c3, temp[0]); + s2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + extend_64bit(io[2], temp); + s3[0] = _mm_mul_epi32(pair_c4, temp[0]); + s3[1] = _mm_mul_epi32(pair_c4, temp[1]); + s4[0] = _mm_mul_epi32(pair_c1, temp[0]); + s4[1] = _mm_mul_epi32(pair_c1, temp[1]); + + extend_64bit(io[3], temp); + s5[0] = _mm_mul_epi32(pair_c2, temp[0]); + s5[1] = _mm_mul_epi32(pair_c2, temp[1]); + s6[0] = _mm_mul_epi32(pair_c4, temp[0]); + s6[1] = _mm_mul_epi32(pair_c4, temp[1]); + + t0[0] = _mm_add_epi64(s0[0], s3[0]); + t0[1] = _mm_add_epi64(s0[1], s3[1]); + t0[0] = _mm_add_epi64(t0[0], s5[0]); + t0[1] = _mm_add_epi64(t0[1], s5[1]); + t1[0] = _mm_sub_epi64(s1[0], s4[0]); + t1[1] = _mm_sub_epi64(s1[1], s4[1]); + t1[0] = _mm_sub_epi64(t1[0], s6[0]); + t1[1] = _mm_sub_epi64(t1[1], s6[1]); + temp[0] = _mm_sub_epi32(io[0], io[2]); + temp[0] = _mm_add_epi32(temp[0], io[3]); + extend_64bit(temp[0], temp); + t2[0] = _mm_mul_epi32(pair_c3, temp[0]); + t2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + s0[0] = _mm_add_epi64(t0[0], s2[0]); + s0[1] = _mm_add_epi64(t0[1], s2[1]); + s1[0] = _mm_add_epi64(t1[0], s2[0]); + s1[1] = _mm_add_epi64(t1[1], s2[1]); + s3[0] = _mm_add_epi64(t0[0], t1[0]); + s3[1] = _mm_add_epi64(t0[1], t1[1]); + s3[0] = _mm_sub_epi64(s3[0], s2[0]); + s3[1] = _mm_sub_epi64(s3[1], s2[1]); + + s0[0] = dct_const_round_shift_64bit(s0[0]); + s0[1] = dct_const_round_shift_64bit(s0[1]); + s1[0] = dct_const_round_shift_64bit(s1[0]); + s1[1] = dct_const_round_shift_64bit(s1[1]); + s2[0] = dct_const_round_shift_64bit(t2[0]); + s2[1] = dct_const_round_shift_64bit(t2[1]); + s3[0] = dct_const_round_shift_64bit(s3[0]); + s3[1] = dct_const_round_shift_64bit(s3[1]); + io[0] = pack_4(s0[0], s0[1]); + io[1] = pack_4(s1[0], s1[1]); + io[2] = pack_4(s2[0], s2[1]); + io[3] = pack_4(s3[0], s3[1]); +} + +void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 0000000000..7d949b6dbc --- /dev/null +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index dcfc454aa0..ad693718c0 100644 --- a/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,36 +10,33 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; - const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -49,67 +46,42 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, in[0]); - d2 = _mm_add_epi16(d2, in[1]); - d0 = _mm_packus_epi16(d0, d2); - // store result[0] - *(int *)dest = _mm_cvtsi128_si32(d0); - // store result[1] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store result[2] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - // store result[3] - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - } + recon_and_store4x4_sse2(in, dest, stride); } void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -131,14 +103,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -150,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b4..ae7c94ea3f 100644 --- a/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/media/libvpx/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, @@ -20,7 +22,7 @@ ; int dst_stride, ; int src_weight ;) -global sym(vp9_filter_by_weight16x16_sse2) PRIVATE +globalsym(vp9_filter_by_weight16x16_sse2) sym(vp9_filter_by_weight16x16_sse2): push rbp mov rbp, rsp @@ -98,7 +100,7 @@ sym(vp9_filter_by_weight16x16_sse2): ; int dst_stride, ; int src_weight ;) -global sym(vp9_filter_by_weight8x8_sse2) PRIVATE +globalsym(vp9_filter_by_weight8x8_sse2) sym(vp9_filter_by_weight8x8_sse2): push rbp mov rbp, rsp @@ -166,7 +168,7 @@ sym(vp9_filter_by_weight8x8_sse2): ; unsigned int *variance, 4 ; unsigned int *sad, 5 ;) -global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE +globalsym(vp9_variance_and_sad_16x16_sse2) sym(vp9_variance_and_sad_16x16_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c index 628d1c8d2b..45ef99adf9 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.c @@ -22,7 +22,11 @@ #include "vpx_ports/mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" @@ -42,34 +46,15 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_job_queue.h" #define MAX_VP9_HEADER_SIZE 80 -static int is_compound_reference_allowed(const VP9_COMMON *cm) { - int i; - for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; +typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); - return 0; -} - -static void setup_compound_reference_mode(VP9_COMMON *cm) { - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } -} +typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); @@ -83,6 +68,7 @@ static int decode_unsigned_max(struct vpx_read_bit_buffer *rb, int max) { static TX_MODE read_tx_mode(vpx_reader *r) { TX_MODE tx_mode = vpx_read_literal(r, 2); if (tx_mode == ALLOW_32X32) tx_mode += vpx_read_bit(r); + assert(tx_mode < TX_MODES); return tx_mode; } @@ -118,7 +104,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, vpx_reader *r) { - if (is_compound_reference_allowed(cm)) { + if (vp9_compound_reference_allowed(cm)) { return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE) : SINGLE_REFERENCE; @@ -189,21 +175,22 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -256,21 +243,22 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane, assert(eob > 0); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_8X8: - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_16X16: - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd); break; case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -337,9 +325,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, if (!mi->skip) { const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -349,20 +337,121 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, } } -static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, - int plane, int row, int col, - TX_SIZE tx_size) { +static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + if (!mi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, + int plane, int row, + int col, + TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + if (*pd->eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, *pd->eob); + } + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size, + int mi_row, int mi_col) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (eob > 0) { + inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob); + } +#if CONFIG_MISMATCH_DEBUG + { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, blk_w, + blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#else + (void)mi_row; + (void)mi_col; +#endif + return eob; +} + +static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); + *pd->eob = eob; + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static int reconstruct_inter_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = *pd->eob; + + (void)mi; if (eob > 0) { inverse_transform_block_inter( xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], pd->dst.stride, eob); } + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + return eob; } @@ -442,45 +531,39 @@ static void high_build_mc_border(const uint8_t *src8, int src_stride, #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_HIGHBITDEPTH -static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, - int x0, int y0, int b_w, int b_h, - int frame_width, int frame_height, +static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1, + int pre_buf_stride, int x0, int y0, int b_w, + int b_h, int frame_width, int frame_height, int border_offset, uint8_t *const dst, int dst_buf_stride, int subpel_x, int subpel_y, const InterpKernel *kernel, const struct scale_factors *sf, MACROBLOCKD *xd, int w, int h, int ref, int xs, int ys) { - DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]); - const uint8_t *buf_ptr; - + uint16_t *mc_buf_high = twd->extend_and_predict_buf; if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset; + highbd_inter_predictor(mc_buf_high + border_offset, b_w, + CONVERT_TO_SHORTPTR(dst), dst_buf_stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w, x0, y0, b_w, b_h, frame_width, frame_height); - buf_ptr = ((uint8_t *)mc_buf_high) + border_offset; - } - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, - subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); - } else { - inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x, subpel_y, sf, - w, h, ref, kernel, xs, ys); + inter_predictor(((uint8_t *)mc_buf_high) + border_offset, b_w, dst, + dst_buf_stride, subpel_x, subpel_y, sf, w, h, ref, kernel, + xs, ys); } } #else -static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, - int x0, int y0, int b_w, int b_h, - int frame_width, int frame_height, +static void extend_and_predict(TileWorkerData *twd, const uint8_t *buf_ptr1, + int pre_buf_stride, int x0, int y0, int b_w, + int b_h, int frame_width, int frame_height, int border_offset, uint8_t *const dst, int dst_buf_stride, int subpel_x, int subpel_y, const InterpKernel *kernel, const struct scale_factors *sf, int w, int h, int ref, int xs, int ys) { - DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + uint8_t *mc_buf = (uint8_t *)twd->extend_and_predict_buf; const uint8_t *buf_ptr; build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w, x0, y0, b_w, b_h, @@ -493,7 +576,7 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_VP9_HIGHBITDEPTH static void dec_build_inter_predictors( - VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x, + TileWorkerData *twd, MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, const struct scale_factors *sf, struct buf_2d *pre_buf, struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, @@ -596,12 +679,6 @@ static void dec_build_inter_predictors( y_pad = 1; } - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -611,27 +688,20 @@ static void dec_build_inter_predictors( const int b_h = y1 - y0 + 1; const int border_offset = y_pad * 3 * b_w + x_pad * 3; - extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h, frame_width, - frame_height, border_offset, dst, dst_buf->stride, - subpel_x, subpel_y, kernel, sf, + extend_and_predict(twd, buf_ptr1, buf_stride, x0, y0, b_w, b_h, + frame_width, frame_height, border_offset, dst, + dst_buf->stride, subpel_x, subpel_y, kernel, sf, #if CONFIG_VP9_HIGHBITDEPTH xd, #endif w, h, ref, xs, ys); return; } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) { - const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - } } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + highbd_inter_predictor(CONVERT_TO_SHORTPTR(buf_ptr), buf_stride, + CONVERT_TO_SHORTPTR(dst), dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd); } else { inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, @@ -643,7 +713,8 @@ static void dec_build_inter_predictors( #endif // CONFIG_VP9_HIGHBITDEPTH } -static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, +static void dec_build_inter_predictors_sb(TileWorkerData *twd, + VP9Decoder *const pbi, MACROBLOCKD *xd, int mi_row, int mi_col) { int plane; @@ -655,8 +726,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int is_compound = has_second_ref(mi); int ref; int is_scaled; - VPxWorker *const fwo = - pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL; for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; @@ -688,7 +757,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, for (y = 0; y < num_4x4_h; ++y) { for (x = 0; x < num_4x4_w; ++x) { const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x, + dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); @@ -705,7 +774,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int n4w_x4 = 4 * num_4x4_w; const int n4h_x4 = 4 * num_4x4_h; struct buf_2d *const pre_buf = &pd->pre[ref]; - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + dec_build_inter_predictors(twd, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); } @@ -733,6 +802,25 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, } } +static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, int bw, int bh, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + xd->mi = cm->mi_grid_visible + offset; + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis, int bwl, int bhl) { @@ -762,6 +850,66 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, return xd->mi[0]; } +static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + predict_recon_func func) { + int eobtotal = 0; + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += func(twd, mi, plane, row, col, tx_size); + } + return eobtotal; +} + +static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + intra_recon_func func) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + func(twd, mi, plane, row, col, tx_size); + } +} + static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; @@ -818,7 +966,25 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } } else { // Prediction - dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); + dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col); +#if CONFIG_MISMATCH_DEBUG + { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif // Reconstruction if (!mi->skip) { @@ -838,16 +1004,17 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); const int max_blocks_high = num_4x4_h + - (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> - (5 + pd->subsampling_y)); + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; for (row = 0; row < max_blocks_high; row += step) for (col = 0; col < max_blocks_wide; col += step) - eobtotal += - reconstruct_inter_block(twd, mi, plane, row, col, tx_size); + eobtotal += reconstruct_inter_block(twd, mi, plane, row, col, + tx_size, mi_row, mi_col); } if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter @@ -861,6 +1028,98 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } } +static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, + predict_and_reconstruct_intra_block_row_mt); + } else { + // Prediction + dec_build_inter_predictors_sb(twd, pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt); + } + } + + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); +} + +static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt); + } else { + if (!mi->skip) { + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int *eob[MAX_MB_PLANE]; + int plane; + int eobtotal; + // Based on eobtotal and bsize, this may be mi->skip may be set to true + // In that case dqcoeff and eob need to be backed up and restored as + // recon_block will not increment these pointers for skip cases + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + dqcoeff[plane] = pd->dqcoeff; + eob[plane] = pd->eob; + } + eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); + + if (bsize >= BLOCK_8X8 && eobtotal == 0) { + mi->skip = 1; // skip loopfilter + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->dqcoeff = dqcoeff[plane]; + pd->eob = eob[plane]; + } + } + } + } + + xd->corrupted |= vpx_reader_has_error(r); +} + static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, int mi_col, int bsl) { const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; @@ -967,14 +1226,82 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } +static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2, int parse_recon_flag, + process_block_fn_t process_block) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (parse_recon_flag & PARSE) { + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + } + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2, parse_recon_flag, process_block); + break; + default: assert(0 && "Invalid partition type"); + } + } + + if (parse_recon_flag & PARSE) { + // update partition context + if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) && + bsize >= BLOCK_8X8) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + } +} + static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, size_t read_size, struct vpx_internal_error_info *error_info, vpx_reader *r, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { - // Validate the calculated partition length. If the buffer - // described by the partition can't be fully read, then restrict - // it to the portion that can be (for EC mode) or throw an error. + // Validate the calculated partition length. If the buffer described by the + // partition can't be fully read then throw an error. if (!read_is_valid(data, read_size, data_end)) vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); @@ -1144,7 +1471,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cm->cur_frame->mvs))); } @@ -1165,9 +1492,15 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { // Allocations in vp9_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { - if (vp9_alloc_context_buffers(cm, width, height)) + if (vp9_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. + cm->width = 0; + cm->height = 0; vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate context buffers"); + } } else { vp9_set_mb_mi(cm, width, height); } @@ -1188,7 +1521,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1198,12 +1530,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1274,7 +1605,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1284,12 +1614,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1369,6 +1698,322 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data, } } +static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); + row_mt_worker_data->recon_map[map_idx] = 1; + pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]); + pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + volatile int8_t *map = row_mt_worker_data->recon_map + map_idx; + pthread_mutex_t *const mutex = + &row_mt_worker_data->recon_sync_mutex[sync_idx]; + pthread_mutex_lock(mutex); + while (!(*map)) { + pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex); + } + pthread_mutex_unlock(mutex); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) { + int return_val = 0; +#if CONFIG_MULTITHREAD + int corrupted; + pthread_mutex_lock(lf_sync->lf_mutex); + corrupted = lf_sync->corrupted; + pthread_mutex_unlock(lf_sync->lf_mutex); + if (!corrupted) { + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1; + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); + } +#else + (void)lf_sync; + (void)row; + (void)num_tile_cols; +#endif + return return_val; +} + +static void vp9_tile_done(VP9Decoder *pbi) { +#if CONFIG_MULTITHREAD + int terminate; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int all_parse_done = 1 << pbi->common.log2_tile_cols; + pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex); + row_mt_worker_data->num_tiles_done++; + terminate = all_parse_done == row_mt_worker_data->num_tiles_done; + pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex); + if (terminate) { + vp9_jobq_terminate(&row_mt_worker_data->jobq); + } +#else + (void)pbi; +#endif +} + +static void vp9_jobq_alloc(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job); + + if (jobq_size > row_mt_worker_data->jobq_size) { + vpx_free(row_mt_worker_data->jobq_buf); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); + vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, + jobq_size); + row_mt_worker_data->jobq_size = jobq_size; + } +} + +static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int is_last_row, VP9LfSync *lf_sync, + int cur_tile_col) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int tile_cols = 1 << cm->log2_tile_cols; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int mi_col_start = tile_data->xd.tile.mi_col_start; + int mi_col_end = tile_data->xd.tile.mi_col_end; + int mi_col; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + + // Top Dependency + if (cur_sb_row) { + map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c, + ((cur_sb_row - 1) * tile_cols) + cur_tile_col); + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON, + recon_block); + if (cm->lf.filter_level && !cm->skip_loop_filter) { + // Queue LPF_JOB + int is_lpf_job_ready = 0; + + if (mi_col + MI_BLOCK_SIZE >= mi_col_end) { + // Checks if this row has been decoded in all tiles + is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols); + + if (is_lpf_job_ready) { + Job lpf_job; + lpf_job.job_type = LPF_JOB; + if (cur_sb_row > 0) { + lpf_job.row_num = mi_row - MI_BLOCK_SIZE; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + if (is_last_row) { + lpf_job.row_num = mi_row; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + } + } + } + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + cur_tile_col); + } +} + +static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int cur_tile_col, uint8_t **data_end) { + int mi_col; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + TileInfo *tile = &tile_data->xd.tile; + TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col]; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, cm, 0, cur_tile_col); + + /* Update reader only at the beginning of each row in a tile */ + if (mi_row == 0) { + setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + } + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + tile_data->xd.error_info = &tile_data->error_info; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE, + parse_block); + } +} + +static int row_decode_worker_hook(void *arg1, void *arg2) { + ThreadData *const thread_data = (ThreadData *)arg1; + uint8_t **data_end = (uint8_t **)arg2; + VP9Decoder *const pbi = thread_data->pbi; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + Job job; + LFWorkerData *lf_data = thread_data->lf_data; + VP9LfSync *lf_sync = thread_data->lf_sync; + volatile int corrupted = 0; + TileWorkerData *volatile tile_data_recon = NULL; + + while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) { + int mi_col; + const int mi_row = job.row_num; + + if (job.job_type == LPF_JOB) { + lf_data->start = mi_row; + lf_data->stop = lf_data->start + MI_BLOCK_SIZE; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + mi_row < cm->mi_rows) { + vp9_loopfilter_job(lf_data, lf_sync); + } + } else if (job.job_type == RECON_JOB) { + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int is_last_row = sb_rows - 1 == cur_sb_row; + int mi_col_start, mi_col_end; + if (!tile_data_recon) + CHECK_MEM_ERROR(&cm->error, tile_data_recon, + vpx_memalign(32, sizeof(TileWorkerData))); + + tile_data_recon->xd = pbi->mb; + vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col); + vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff); + mi_col_start = tile_data_recon->xd.tile.mi_col_start; + mi_col_end = tile_data_recon->xd.tile.mi_col_end; + + if (setjmp(tile_data_recon->error_info.jmp)) { + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + tile_data_recon->error_info.setjmp = 0; + corrupted = 1; + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + job.tile_col); + } + if (is_last_row) { + vp9_tile_done(pbi); + } + continue; + } + + tile_data_recon->error_info.setjmp = 1; + tile_data_recon->xd.error_info = &tile_data_recon->error_info; + + recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync, + job.tile_col); + + if (corrupted) + vpx_internal_error(&tile_data_recon->error_info, + VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (is_last_row) { + vp9_tile_done(pbi); + } + } else if (job.job_type == PARSE_JOB) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col]; + + if (setjmp(tile_data->error_info.jmp)) { + tile_data->error_info.setjmp = 0; + corrupted = 1; + vp9_tile_done(pbi); + continue; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts; + + tile_data->error_info.setjmp = 1; + + parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end); + + corrupted |= tile_data->xd.corrupted; + if (corrupted) + vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + /* Queue in the recon_job for this row */ + { + Job recon_job; + recon_job.row_num = mi_row; + recon_job.tile_col = job.tile_col; + recon_job.job_type = RECON_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job, + sizeof(recon_job)); + } + + /* Queue next parse job */ + if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) { + Job parse_job; + parse_job.row_num = mi_row + MI_BLOCK_SIZE; + parse_job.tile_col = job.tile_col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, + sizeof(parse_job)); + } + } + } + + vpx_free(tile_data_recon); + return !corrupted; +} + static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; @@ -1383,9 +2028,9 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, if (cm->lf.filter_level && !cm->skip_loop_filter && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VPxWorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1447,7 +2092,29 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + if (pbi->row_mt == 1) { + int plane; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + PARSE, parse_block); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + RECON, recon_block); + } else { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1474,11 +2141,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock - // row. - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -1494,34 +2156,70 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, // Get last tile data. tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. -static int tile_worker_hook(TileWorkerData *const tile_data, - VP9Decoder *const pbi) { +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; - volatile int n = tile_data->buf_start; - tile_data->error_info.setjmp = 1; + VP9_COMMON *cm = &pbi->common; + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; + volatile int n = tile_data->buf_start; if (setjmp(tile_data->error_info.jmp)) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } + tile_data->error_info.setjmp = 1; tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. (So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1539,6 +2237,14 @@ static int tile_worker_hook(TileWorkerData *const tile_data, mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1546,15 +2252,194 @@ static int tile_worker_hook(TileWorkerData *const tile_data, } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } // sorts in descending order static int compare_tile_buffers(const void *a, const void *b) { - const TileBuffer *const buf1 = (const TileBuffer *)a; - const TileBuffer *const buf2 = (const TileBuffer *)b; - return (int)(buf2->size - buf1->size); + const TileBuffer *const buf_a = (const TileBuffer *)a; + const TileBuffer *const buf_b = (const TileBuffer *)b; + return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size); +} + +static INLINE void init_mt(VP9Decoder *pbi) { + int n; + VP9_COMMON *const cm = &pbi->common; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + + if (pbi->num_tile_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, + vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + for (n = 0; n < num_threads; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ++pbi->num_tile_workers; + + winterface->init(worker); + worker->thread_name = "vpx tile worker"; + if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Initialize LPF + if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); +} + +static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = pbi->max_threads; + int i, n; + int col; + int corrupted = 0; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + memset(row_mt_worker_data->recon_map, 0, + sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map)); + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n]; + winterface->sync(worker); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + thread_data->lf_sync = lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm, + pbi->mb.plane); + } + + thread_data->pbi = pbi; + + worker->hook = row_decode_worker_hook; + worker->data1 = thread_data; + worker->data2 = (void *)&row_mt_worker_data->data_end; + } + + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + } + + /* Reset the jobq to start of the jobq buffer */ + vp9_jobq_reset(&row_mt_worker_data->jobq); + row_mt_worker_data->num_tiles_done = 0; + row_mt_worker_data->data_end = NULL; + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Initialize thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + vp9_zero(tile_data->counts); + } + } + + // queue parse jobs for 0th row of every tile + for (col = 0; col < tile_cols; ++col) { + Job parse_job; + parse_job.row_num = 0; + parse_job.tile_col = col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job)); + } + + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + worker->had_error = 0; + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + corrupted |= !winterface->sync(worker); + } + + pbi->mb.corrupted = corrupted; + + { + /* Set data end */ + TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1]; + row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader); + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (i = 0; i < tile_cols; ++i) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[i]; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + return row_mt_worker_data->data_end; } static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, @@ -1562,7 +2447,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, VP9_COMMON *const cm = &pbi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const uint8_t *bit_reader_end = NULL; - const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; const int num_workers = VPXMIN(pbi->max_threads, tile_cols); @@ -1572,21 +2458,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, assert(tile_rows == 1); (void)tile_rows; - if (pbi->num_tile_workers == 0) { - const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, - vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); - for (n = 0; n < num_threads; ++n) { - VPxWorker *const worker = &pbi->tile_workers[n]; - ++pbi->num_tile_workers; - - winterface->init(worker); - if (n < num_threads - 1 && !winterface->reset(worker)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Tile decoder thread creation failed"); - } - } - } + init_mt(pbi); // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { @@ -1594,23 +2466,22 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; - worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->hook = tile_worker_hook; worker->data1 = tile_data; worker->data2 = pbi; } - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. - memset(cm->above_context, 0, - sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); - memset(cm->above_seg_context, 0, - sizeof(*cm->above_seg_context) * aligned_mi_cols); - - vp9_reset_lfm(cm); - // Load tile data into tile_buffers get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, &pbi->tile_buffers); @@ -1750,6 +2621,22 @@ static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm, } } +static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) { + if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i == cm->new_fb_idx) continue; + frame_bufs[i].ref_count = 0; + if (!frame_bufs[i].released) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer); + frame_bufs[i].released = 1; + } + } + } +} + static size_t read_uncompressed_header(VP9Decoder *pbi, struct vpx_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; @@ -1780,24 +2667,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; - lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); } ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; - if (pbi->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; - } return 0; } @@ -1821,6 +2701,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_frame_size(cm, rb); if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + flush_all_fb_on_key(cm); pbi->need_resync = 0; } } else { @@ -1914,7 +2795,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); // Generate next_ref_frame_map. - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; @@ -1934,7 +2814,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->ref_frame_map[ref_index] >= 0) ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; if (frame_is_intra_only(cm) || cm->error_resilient_mode) @@ -1946,6 +2825,35 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_segmentation_dequant(cm); setup_tile_info(cm, rb); + if (pbi->row_mt == 1) { + int num_sbs = 1; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int num_jobs = sb_rows << cm->log2_tile_cols; + + if (pbi->row_mt_worker_data == NULL) { + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, + vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); +#endif + } + + if (pbi->max_threads > 1) { + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + + num_sbs = sb_cols * sb_rows; + } + + if (num_sbs > pbi->row_mt_worker_data->num_sbs || + num_jobs > pbi->row_mt_worker_data->num_jobs) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs, + pbi->max_threads, num_jobs); + } + vp9_jobq_alloc(pbi); + } sz = vpx_rb_read_literal(rb, 16); if (sz == 0) @@ -1988,7 +2896,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, cm->reference_mode = read_frame_reference_mode(cm, &r); if (cm->reference_mode != SINGLE_REFERENCE) - setup_compound_reference_mode(cm); + vp9_setup_compound_reference_mode(cm); read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) @@ -2056,6 +2964,12 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif xd->cur_buf = new_fb; if (!first_partition_size) { @@ -2091,24 +3005,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. - if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context) { - context_updated = 1; - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - } - vp9_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } - if (pbi->tile_worker_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) { const int num_tile_workers = @@ -2118,24 +3014,33 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, // platforms without DECLARE_ALIGNED(). assert((sizeof(*pbi->tile_worker_data) % 16) == 0); vpx_free(pbi->tile_worker_data); - CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); pbi->total_tiles = tile_rows * tile_cols; } - if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { - // Multi-threaded tile decoder - *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); - } + if (pbi->max_threads > 1 && tile_rows == 1 && + (tile_cols > 1 || pbi->row_mt == 1)) { + if (pbi->row_mt == 1) { + *p_data_end = + decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end); } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt( + new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0, + pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + } } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h index 44717f546a..ba95e72344 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ -#define VP9_DECODER_VP9_DECODEFRAME_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_ +#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEFRAME_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c index 1a4152436a..0989cde58d 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.c @@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, mi->skip = read_skip(cm, xd, mi->segment_id, r); mi->tx_size = read_tx_size(cm, xd, 1, r); mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; switch (bsize) { case BLOCK_4X4: @@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, } } -// Read the referncence frame +// Read the reference frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { @@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding @@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = LAST_FRAME; } - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { assert(0 && "Invalid prediction mode."); } @@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, mi->interp_filter = SWITCHABLE_FILTERS; mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; } static INLINE int is_mv_valid(const MV *mv) { @@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd, zero_mv_pair(mv); break; } - default: { return 0; } + default: { + return 0; + } } return ret; } @@ -444,23 +446,6 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, } } -static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, - int refmv_count) { - int i; - - // Make sure all the candidates are properly clamped etc - for (i = 0; i < refmv_count; ++i) { - lower_mv_precision(&mvlist[i].as_mv, allow_hp); - *best_mv = mvlist[i]; - } -} - -static void fpm_sync(void *const data, int mi_row) { - VP9Decoder *const pbi = (VP9Decoder *)data; - vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << MI_BLOCK_SIZE_LOG2); -} - // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector or early_break // it will also skip all additional processing and jump to Done! @@ -500,8 +485,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const POSITION *const mv_ref_search, int_mv *mv_ref_list, int mi_row, int mi_col, - int block, int is_sub8x8, find_mv_refs_sync sync, - void *const data) { + int block) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; int different_ref_found = 0; @@ -518,7 +502,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); i = 0; - if (is_sub8x8) { + if (block >= 0) { // If the size < 8x8 we get the mv from the bmi substructure for the // nearest two blocks. for (i = 0; i < 2; ++i) { @@ -557,23 +541,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. The sync here is unnecessary if use_prev_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif - // Check the last frame's mode and mv info. if (prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { @@ -650,19 +619,22 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); - refmv_count = - dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, - mv_list, mi_row, mi_col, block, 1, NULL, NULL); - switch (block) { - case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break; + case 0: + refmv_count = + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); + best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; + break; case 1: case 2: if (b_mode == NEARESTMV) { best_sub8x8->as_int = bmi[0].as_mv[ref].as_int; } else { + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); best_sub8x8->as_int = 0; - for (n = 0; n < refmv_count; ++n) + for (n = 0; n < 2; ++n) if (bmi[0].as_mv[ref].as_int != mv_list[n].as_int) { best_sub8x8->as_int = mv_list[n].as_int; break; @@ -673,15 +645,20 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, if (b_mode == NEARESTMV) { best_sub8x8->as_int = bmi[2].as_mv[ref].as_int; } else { - int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; - candidates[0] = bmi[1].as_mv[ref]; - candidates[1] = bmi[0].as_mv[ref]; - candidates[2] = mv_list[0]; - candidates[3] = mv_list[1]; best_sub8x8->as_int = 0; - for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) - if (bmi[2].as_mv[ref].as_int != candidates[n].as_int) { - best_sub8x8->as_int = candidates[n].as_int; + if (bmi[2].as_mv[ref].as_int != bmi[1].as_mv[ref].as_int) { + best_sub8x8->as_int = bmi[1].as_mv[ref].as_int; + break; + } + if (bmi[2].as_mv[ref].as_int != bmi[0].as_mv[ref].as_int) { + best_sub8x8->as_int = bmi[0].as_mv[ref].as_int; + break; + } + dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, + mv_list, mi_row, mi_col, block); + for (n = 0; n < 2; ++n) + if (bmi[2].as_mv[ref].as_int != mv_list[n].as_int) { + best_sub8x8->as_int = mv_list[n].as_int; break; } } @@ -718,7 +695,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, VP9_COMMON *const cm = &pbi->common; const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv best_ref_mvs[2]; + int_mv best_ref_mvs[2] = { { 0 }, { 0 } }; int ref, is_compound; uint8_t inter_mode_ctx; const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; @@ -731,33 +708,12 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segement feature on small blocks"); + "Invalid usage of segment feature on small blocks"); return; } } else { if (bsize >= BLOCK_8X8) mi->mode = read_inter_mode(cm, xd, r, inter_mode_ctx); - else - // Sub 8x8 blocks use the nearestmv as a ref_mv if the b_mode is NEWMV. - // Setting mode to NEARESTMV forces the search to stop after the nearestmv - // has been found. After b_modes have been read, mode will be overwritten - // by the last b_mode. - mi->mode = NEARESTMV; - - if (mi->mode != ZEROMV) { - for (ref = 0; ref < 1 + is_compound; ++ref) { - int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; - const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; - int refmv_count; - - refmv_count = - dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, - mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi); - - dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], - refmv_count); - } - } } mi->interp_filter = (cm->interp_filter == SWITCHABLE) @@ -769,6 +725,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const int num_4x4_h = 1 << xd->bmode_blocks_hl; int idx, idy; PREDICTION_MODE b_mode; + int got_mv_refs_for_new = 0; int_mv best_sub8x8[2]; const uint32_t invalid_mv = 0x80008000; // Initialize the 2nd element as even though it won't be used meaningfully @@ -783,12 +740,24 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, for (ref = 0; ref < 1 + is_compound; ++ref) append_sub8x8_mvs_for_idx(cm, xd, mv_ref_search, b_mode, j, ref, mi_row, mi_col, &best_sub8x8[ref]); + } else if (b_mode == NEWMV && !got_mv_refs_for_new) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + + dec_find_mv_refs(cm, xd, NEWMV, frame, mv_ref_search, tmp_mvs, + mi_row, mi_col, -1); + + lower_mv_precision(&tmp_mvs[0].as_mv, allow_hp); + best_ref_mvs[ref] = tmp_mvs[0]; + got_mv_refs_for_new = 1; + } } if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs, best_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; - break; + return; } if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j]; @@ -800,6 +769,17 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, copy_mv_pair(mi->mv, mi->bmi[3].as_mv); } else { + if (mi->mode != ZEROMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + int_mv tmp_mvs[MAX_MV_REF_CANDIDATES]; + const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; + int refmv_count = + dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, + mi_row, mi_col, -1); + lower_mv_precision(&tmp_mvs[refmv_count - 1].as_mv, allow_hp); + best_ref_mvs[ref] = tmp_mvs[refmv_count - 1]; + } + } xd->corrupted |= !assign_mv(cm, xd, mi->mode, mi->mv, best_ref_mvs, best_ref_mvs, is_compound, allow_hp, r); } @@ -842,13 +822,21 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, if (frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis); } else { + // Cache mi->ref_frame and mi->mv so that the compiler can prove that they + // are constant for the duration of the loop and avoids reloading them. + MV_REFERENCE_FRAME mi_ref_frame[2]; + int_mv mi_mv[2]; + read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis); + copy_ref_frame_pair(mi_ref_frame, mi->ref_frame); + copy_mv_pair(mi_mv, mi->mv); + for (h = 0; h < y_mis; ++h) { for (w = 0; w < x_mis; ++w) { MV_REF *const mv = frame_mvs + w; - copy_ref_frame_pair(mv->ref_frame, mi->ref_frame); - copy_mv_pair(mv->mv, mi->mv); + copy_ref_frame_pair(mv->ref_frame, mi_ref_frame); + copy_mv_pair(mv->mv, mi_mv); } frame_mvs += cm->mi_cols; } diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h index b460cb8fb1..11b45ace06 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODEMV_H_ -#define VP9_DECODER_VP9_DECODEMV_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_ +#define VPX_VP9_DECODER_VP9_DECODEMV_H_ #include "vpx_dsp/bitreader.h" @@ -26,4 +26,4 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEMV_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEMV_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c index 37693f0944..5c77df5002 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.c @@ -21,6 +21,7 @@ #include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" @@ -55,6 +56,94 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) { cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); } +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs) { + int plane; + const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * + sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_jobs = num_jobs; +#if CONFIG_MULTITHREAD + { + int i; + CHECK_MEM_ERROR( + &cm->error, row_mt_worker_data->recon_sync_mutex, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); + if (row_mt_worker_data->recon_sync_mutex) { + for (i = 0; i < num_jobs; ++i) { + pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR( + &cm->error, row_mt_worker_data->recon_sync_cond, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); + if (row_mt_worker_data->recon_sync_cond) { + for (i = 0; i < num_jobs; ++i) { + pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL); + } + } + } +#endif + row_mt_worker_data->num_sbs = num_sbs; + for (plane = 0; plane < 3; ++plane) { + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(32, dqcoeff_size)); + memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], + vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, + sizeof(*row_mt_worker_data->eob[plane]))); + } + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, + vpx_calloc(num_sbs * PARTITIONS_PER_SB, + sizeof(*row_mt_worker_data->partition))); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, + vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); + + // allocate memory for thread_data + if (row_mt_worker_data->thread_data == NULL) { + const size_t thread_size = + max_threads * sizeof(*row_mt_worker_data->thread_data); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, + vpx_memalign(32, thread_size)); + } +} + +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { + if (row_mt_worker_data != NULL) { + int plane; +#if CONFIG_MULTITHREAD + int i; + if (row_mt_worker_data->recon_sync_mutex != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]); + } + vpx_free(row_mt_worker_data->recon_sync_mutex); + row_mt_worker_data->recon_sync_mutex = NULL; + } + if (row_mt_worker_data->recon_sync_cond != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]); + } + vpx_free(row_mt_worker_data->recon_sync_cond); + row_mt_worker_data->recon_sync_cond = NULL; + } +#endif + for (plane = 0; plane < 3; ++plane) { + vpx_free(row_mt_worker_data->eob[plane]); + row_mt_worker_data->eob[plane] = NULL; + vpx_free(row_mt_worker_data->dqcoeff[plane]); + row_mt_worker_data->dqcoeff[plane] = NULL; + } + vpx_free(row_mt_worker_data->partition); + row_mt_worker_data->partition = NULL; + vpx_free(row_mt_worker_data->recon_map); + row_mt_worker_data->recon_map = NULL; + vpx_free(row_mt_worker_data->thread_data); + row_mt_worker_data->thread_data = NULL; + } +} + static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); if (!cm->mip) return 1; @@ -65,10 +154,16 @@ static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { } static void vp9_dec_free_mi(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + // MFQE allocates an additional mip and swaps it with cm->mip. + vpx_free(cm->postproc_state.prev_mip); + cm->postproc_state.prev_mip = NULL; +#endif vpx_free(cm->mip); cm->mip = NULL; vpx_free(cm->mi_grid_base); cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; } VP9Decoder *vp9_decoder_create(BufferPool *const pool) { @@ -87,9 +182,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; @@ -99,7 +195,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); - cm->current_video_frame = 0; + init_frame_indexes(cm); pbi->ready_for_new_data = 1; pbi->common.buffer_pool = pool; @@ -115,6 +211,7 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 0; vpx_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "vpx lf worker"; return pbi; } @@ -139,6 +236,19 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + if (pbi->row_mt == 1) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + if (pbi->row_mt_worker_data != NULL) { + vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq); + vpx_free(pbi->row_mt_worker_data->jobq_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex); +#endif + } + vpx_free(pbi->row_mt_worker_data); + } + + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -169,7 +279,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); else - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); } @@ -217,7 +327,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. - vp8_yv12_copy_frame(sd, ref_buf); + vpx_yv12_copy_frame(sd, ref_buf); } return cm->error.error_code; @@ -230,7 +340,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) { BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { const int old_idx = cm->ref_frame_map[ref_index]; // Current thread releases the holding of reference frame. @@ -250,21 +359,54 @@ static void swap_frame_buffers(VP9Decoder *pbi) { decrease_ref_count(old_idx, frame_bufs, pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 0; cm->frame_to_show = get_frame_new_buffer(cm); - if (!pbi->frame_parallel_decode || !cm->show_frame) { - lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; - unlock_buffer_pool(pool); - } + --frame_bufs[cm->new_fb_idx].ref_count; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) cm->frame_refs[ref_index].idx = -1; } +static void release_fb_on_decoder_exit(VP9Decoder *pbi) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } +} + int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, const uint8_t **psource) { VP9_COMMON *volatile const cm = &pbi->common; @@ -292,14 +434,19 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. - if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + vpx_clear_system_state(); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Unable to find free frame buffer"); return cm->error.error_code; @@ -309,60 +456,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; pbi->hold_ref_buf = 0; - if (pbi->frame_parallel_decode) { - VPxWorker *const worker = pbi->frame_worker_owner; - vp9_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - vp9_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - int i; - cm->error.setjmp = 0; pbi->ready_for_new_data = 1; - - // Synchronize all threads immediately as a subsequent decode call may - // cause a resize invalidating some allocations. - winterface->sync(&pbi->lf_worker); - for (i = 0; i < pbi->num_tile_workers; ++i) { - winterface->sync(&pbi->tile_workers[i]); - } - - lock_buffer_pool(pool); - // Release all the reference buffers if worker thread is holding them. - if (pbi->hold_ref_buf == 1) { - int ref_index = 0, mask; - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - // Current thread releases the holding of reference frame. - decrease_ref_count(old_idx, frame_bufs, pool); - - // Release the reference frame in reference map. - if (mask & 1) { - decrease_ref_count(old_idx, frame_bufs, pool); - } - ++ref_index; - } - - // Current thread releases the holding of reference frame. - for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); - } - pbi->hold_ref_buf = 0; - } + release_fb_on_decoder_exit(pbi); // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - unlock_buffer_pool(pool); - vpx_clear_system_state(); return -1; } @@ -377,31 +478,16 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; cm->prev_frame = cm->cur_frame; - if (cm->seg.enabled && !pbi->frame_parallel_decode) - vp9_swap_current_and_last_seg_map(cm); + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } - // Update progress in frame parallel decode. - if (pbi->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - vp9_frameworker_lock_stats(worker); + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - if (cm->show_frame) { - cm->current_video_frame++; - } + // Update progress in frame parallel decode. + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; } cm->error.setjmp = 0; @@ -427,7 +513,7 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, #if CONFIG_VP9_POSTPROC if (!cm->show_existing_frame) { - ret = vp9_post_proc_frame(cm, sd, flags); + ret = vp9_post_proc_frame(cm, sd, flags, cm->width); } else { *sd = *cm->frame_to_show; ret = 0; diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h index 427baf1e0b..b3ee4eab5f 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_decoder.h @@ -8,25 +8,38 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODER_H_ -#define VP9_DECODER_VP9_DECODER_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODER_H_ +#define VPX_VP9_DECODER_VP9_DECODER_H_ #include "./vpx_config.h" #include "vpx/vpx_codec.h" #include "vpx_dsp/bitreader.h" #include "vpx_scale/yv12config.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/decoder/vp9_dthread.h" +#include "./vp9_job_queue.h" #ifdef __cplusplus extern "C" { #endif +#define EOBS_PER_SB_LOG2 8 +#define DQCOEFFS_PER_SB_LOG2 12 +#define PARTITIONS_PER_SB 85 + +typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType; + +typedef struct ThreadData { + struct VP9Decoder *pbi; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; +} ThreadData; + typedef struct TileBuffer { const uint8_t *data; size_t size; @@ -38,12 +51,47 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); struct vpx_internal_error_info error_info; } TileWorkerData; +typedef void (*process_block_fn_t)(TileWorkerData *twd, + struct VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, + int bhl); + +typedef struct RowMTWorkerData { + int num_sbs; + int *eob[MAX_MB_PLANE]; + PARTITION_TYPE *partition; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int8_t *recon_map; + const uint8_t *data_end; + uint8_t *jobq_buf; + JobQueueRowMt jobq; + size_t jobq_size; + int num_tiles_done; + int num_jobs; +#if CONFIG_MULTITHREAD + pthread_mutex_t recon_done_mutex; + pthread_mutex_t *recon_sync_mutex; + pthread_cond_t *recon_sync_cond; +#endif + ThreadData *thread_data; +} RowMTWorkerData; + +/* Structure to queue and dequeue row decode jobs */ +typedef struct Job { + int row_num; + int tile_col; + JobType job_type; +} Job; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -53,13 +101,10 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int frame_parallel_decode; // frame-based threading. - // TODO(hkuang): Combine this with cur_buf in macroblockd as they are // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. - VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. VPxWorker lf_worker; VPxWorker *tile_workers; TileWorkerData *tile_worker_data; @@ -76,10 +121,14 @@ typedef struct VP9Decoder { int inv_tile_order; int need_resync; // wait for key/intra-only frame. int hold_ref_buf; // hold the reference buffer. + + int row_mt; + int lpf_mt_opt; + RowMTWorkerData *row_mt_worker_data; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, - const uint8_t **dest); + const uint8_t **psource); int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); @@ -113,6 +162,11 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); void vp9_decoder_remove(struct VP9Decoder *pbi); +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs); +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); + static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { if (idx >= 0 && frame_bufs[idx].ref_count > 0) { @@ -121,9 +175,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } @@ -132,4 +187,4 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODER_H_ +#endif // VPX_VP9_DECODER_VP9_DECODER_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c index a441f3addc..d957dc34e3 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c +++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.c @@ -33,6 +33,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, int *count, unsigned int *range) { const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT; const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); +#if CONFIG_BITSTREAM_DEBUG + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } +#endif if (*count < 0) { r->value = *value; @@ -51,6 +65,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 1; + if (bit != ref_result) { + fprintf( + stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 1; } *range = split; @@ -60,6 +88,19 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 0; + if (bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 0; } @@ -92,16 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, int16_t dqv = dq[0]; const uint8_t *const cat6_prob = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) - ? vp9_cat6_prob_high12 - : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 : + (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12 + : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 + : #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_cat6_prob; + vp9_cat6_prob; const int cat6_bits = #if CONFIG_VP9_HIGHBITDEPTH - (xd->bd == VPX_BITS_12) ? 18 : (xd->bd == VPX_BITS_10) ? 16 : + (xd->bd == VPX_BITS_12) ? 18 + : (xd->bd == VPX_BITS_10) ? 16 + : #endif // CONFIG_VP9_HIGHBITDEPTH - 14; + 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. BD_VALUE value = r->value; @@ -201,9 +244,9 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type, #endif // CONFIG_VP9_HIGHBITDEPTH #else if (read_bool(r, 128, &value, &count, &range)) { - dqcoeff[scan[c]] = -v; + dqcoeff[scan[c]] = (tran_low_t)-v; } else { - dqcoeff[scan[c]] = v; + dqcoeff[scan[c]] = (tran_low_t)v; } #endif // CONFIG_COEFFICIENT_RANGE_CHECKING ++c; @@ -229,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id) { +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { vpx_reader *r = &twd->bit_reader; MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h index 7b0d876016..a8e47021b8 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DETOKENIZE_H_ -#define VP9_DECODER_VP9_DETOKENIZE_H_ +#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_ +#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_ #include "vpx_dsp/bitreader.h" #include "vp9/decoder/vp9_decoder.h" @@ -19,12 +19,11 @@ extern "C" { #endif -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id); +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_DECODER_VP9_DETOKENIZE_H_ +#endif // VPX_VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h index 5a8ec8300c..b0c7750736 100644 --- a/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h +++ b/media/libvpx/libvpx/vp9/decoder/vp9_dsubexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DSUBEXP_H_ -#define VP9_DECODER_VP9_DSUBEXP_H_ +#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_ +#define VPX_VP9_DECODER_VP9_DSUBEXP_H_ #include "vpx_dsp/bitreader.h" @@ -23,4 +23,4 @@ void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p); } // extern "C" #endif -#endif // VP9_DECODER_VP9_DSUBEXP_H_ +#endif // VPX_VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c b/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c deleted file mode 100644 index 52bc2a0f60..0000000000 --- a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - -// #define DEBUG_THREAD - -// TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_lock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_unlock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_unlock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_signal_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - -// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. -#if defined(_WIN32) && !HAVE_PTHREAD_H - pthread_cond_signal(&worker_data->stats_cond); -#else - pthread_cond_broadcast(&worker_data->stats_cond); -#endif - -#else - (void)worker; -#endif -} - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif - -// TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row) { -#if CONFIG_MULTITHREAD - if (!ref_buf) return; - -#ifndef BUILDING_WITH_TSAN - // The following line of code will get harmless tsan error but it is the key - // to get best performance. - if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; -#endif - - { - // Find the worker thread that owns the reference frame. If the reference - // frame has been fully decoded, it may not have owner. - VPxWorker *const ref_worker = ref_buf->frame_worker_owner; - FrameWorkerData *const ref_worker_data = - (FrameWorkerData *)ref_worker->data1; - const VP9Decoder *const pbi = ref_worker_data->pbi; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", - worker_data->worker_id, worker, ref_worker_data->worker_id, - ref_buf->frame_worker_owner, row, ref_buf->row); - } -#endif - - vp9_frameworker_lock_stats(ref_worker); - while (ref_buf->row < row && pbi->cur_buf == ref_buf && - ref_buf->buf.corrupted != 1) { - pthread_cond_wait(&ref_worker_data->stats_cond, - &ref_worker_data->stats_mutex); - } - - if (ref_buf->buf.corrupted == 1) { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_frameworker_unlock_stats(ref_worker); - vpx_internal_error(&worker_data->pbi->common.error, - VPX_CODEC_CORRUPT_FRAME, - "Worker %p failed to decode frame", worker); - } - vp9_frameworker_unlock_stats(ref_worker); - } -#else - (void)worker; - (void)ref_buf; - (void)row; - (void)ref_buf; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { -#if CONFIG_MULTITHREAD - VPxWorker *worker = buf->frame_worker_owner; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, - buf->frame_worker_owner, row); - } -#endif - - vp9_frameworker_lock_stats(worker); - buf->row = row; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); -#else - (void)buf; - (void)row; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; - FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; - VP9_COMMON *const src_cm = &src_worker_data->pbi->common; - VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; - int i; - - // Wait until source frame's context is ready. - vp9_frameworker_lock_stats(src_worker); - while (!src_worker_data->frame_context_ready) { - pthread_cond_wait(&src_worker_data->stats_cond, - &src_worker_data->stats_mutex); - } - - dst_cm->last_frame_seg_map = src_cm->seg.enabled - ? src_cm->current_frame_seg_map - : src_cm->last_frame_seg_map; - dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; - vp9_frameworker_unlock_stats(src_worker); - - dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_VP9_HIGHBITDEPTH - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif - dst_cm->prev_frame = - src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; - dst_cm->last_width = - !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; - dst_cm->last_height = - !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; - dst_cm->frame_type = src_cm->frame_type; - dst_cm->last_show_frame = !src_cm->show_existing_frame - ? src_cm->show_frame - : src_cm->last_show_frame; - for (i = 0; i < REF_FRAMES; ++i) - dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; - - memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, - (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; - dst_cm->lf.filter_level = src_cm->lf.filter_level; - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); - memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); - dst_cm->seg = src_cm->seg; - memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, - FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); -#else - (void)dst_worker; - (void)src_worker; -#endif // CONFIG_MULTITHREAD -} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h b/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h deleted file mode 100644 index fce0fe7fe3..0000000000 --- a/media/libvpx/libvpx/vp9/decoder/vp9_dthread.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - -#include "./vpx_config.h" -#include "vpx_util/vpx_thread.h" -#include "vpx/internal/vpx_codec_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct VP9Common; -struct VP9Decoder; - -// WorkerData for the FrameWorker thread. It contains all the information of -// the worker and decode structures for decoding a frame. -typedef struct FrameWorkerData { - struct VP9Decoder *pbi; - const uint8_t *data; - const uint8_t *data_end; - size_t data_size; - void *user_priv; - int result; - int worker_id; - int received_frame; - - // scratch_buffer is used in frame parallel mode only. - // It is used to make a copy of the compressed data. - uint8_t *scratch_buffer; - size_t scratch_buffer_size; - -#if CONFIG_MULTITHREAD - pthread_mutex_t stats_mutex; - pthread_cond_t stats_cond; -#endif - - int frame_context_ready; // Current frame's context is ready to read. - int frame_decoded; // Finished decoding current frame. -} FrameWorkerData; - -void vp9_frameworker_lock_stats(VPxWorker *const worker); -void vp9_frameworker_unlock_stats(VPxWorker *const worker); -void vp9_frameworker_signal_stats(VPxWorker *const worker); - -// Wait until ref_buf has been decoded to row in real pixel unit. -// Note: worker may already finish decoding ref_buf and release it in order to -// start decoding next frame. So need to check whether worker is still decoding -// ref_buf. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row); - -// FrameWorker broadcasts its decoding progress so other workers that are -// waiting on it can resume decoding. -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); - -// Copy necessary decoding context from src worker to dst worker. -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c new file mode 100644 index 0000000000..926ae87739 --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx/vpx_integer.h" +#include "vpx_util/vpx_pthread.h" + +#include "vp9/decoder/vp9_job_queue.h" + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) { +#if CONFIG_MULTITHREAD + pthread_mutex_init(&jobq->mutex, NULL); + pthread_cond_init(&jobq->cond, NULL); +#endif + jobq->buf_base = buf; + jobq->buf_wr = buf; + jobq->buf_rd = buf; + jobq->buf_end = buf + buf_size; + jobq->terminate = 0; +} + +void vp9_jobq_reset(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->buf_wr = jobq->buf_base; + jobq->buf_rd = jobq->buf_base; + jobq->terminate = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +void vp9_jobq_deinit(JobQueueRowMt *jobq) { + vp9_jobq_reset(jobq); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&jobq->mutex); + pthread_cond_destroy(&jobq->cond); +#endif +} + +void vp9_jobq_terminate(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->terminate = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(&jobq->cond); + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_wr + job_size) { + memcpy(jobq->buf_wr, job, job_size); + jobq->buf_wr = jobq->buf_wr + job_size; +#if CONFIG_MULTITHREAD + pthread_cond_signal(&jobq->cond); +#endif + ret = 0; + } else { + /* Wrap around case is not supported */ + assert(0); + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + return ret; +} + +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_rd + job_size) { + while (1) { + if (jobq->buf_wr >= jobq->buf_rd + job_size) { + memcpy(job, jobq->buf_rd, job_size); + jobq->buf_rd = jobq->buf_rd + job_size; + ret = 0; + break; + } else { + /* If all the entries have been dequeued, then break and return */ + if (jobq->terminate == 1) { + ret = 1; + break; + } + if (blocking == 1) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(&jobq->cond, &jobq->mutex); +#endif + } else { + /* If there is no job available, + * and this is non blocking call then return fail */ + ret = 1; + break; + } + } + } + } else { + /* Wrap around case is not supported */ + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + + return ret; +} diff --git a/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h new file mode 100644 index 0000000000..59f71fb9ba --- /dev/null +++ b/media/libvpx/libvpx/vp9/decoder/vp9_job_queue.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ + +#include "vpx_util/vpx_pthread.h" + +typedef struct { + // Pointer to buffer base which contains the jobs + uint8_t *buf_base; + + // Pointer to current address where new job can be added + uint8_t *volatile buf_wr; + + // Pointer to current address from where next job can be obtained + uint8_t *volatile buf_rd; + + // Pointer to end of job buffer + uint8_t *buf_end; + + int terminate; + +#if CONFIG_MULTITHREAD + pthread_mutex_t mutex; + pthread_cond_t cond; +#endif +} JobQueueRowMt; + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size); +void vp9_jobq_reset(JobQueueRowMt *jobq); +void vp9_jobq_deinit(JobQueueRowMt *jobq); +void vp9_jobq_terminate(JobQueueRowMt *jobq); +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size); +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking); + +#endif // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c index afffc77178..997b5477e1 100644 --- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -10,26 +10,2164 @@ #include -#include "./vp9_rtcd.h" #include "./vpx_config.h" +#include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" -#include "vp9/common/vp9_blockd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" -void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - int16_t temp_buffer[64]; - (void)coeff_ptr; +static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, + int stride) { + // { 0, 1, 1, 1 }; + const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3); + // { 1, 0, 0, 0 }; + const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3); + int16x4_t mask; - vpx_fdct8x8_neon(input, temp_buffer, stride); - vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan_ptr, iscan_ptr); + int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a)); + input_0 = vadd_s16(input_0, mask); + input_0 = vadd_s16(input_0, nonzero_bias_b); + + in[0] = vcombine_s16(input_0, input_1); + in[1] = vcombine_s16(input_2, input_3); } + +static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) { + const int16x8_t one_s16 = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], one_s16); + res[1] = vaddq_s16(res[1], one_s16); + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + store_s16q_to_tran_low(output + 0 * 8, res[0]); + store_s16q_to_tran_low(output + 1 * 8, res[1]); +} + +static INLINE void fadst4x4_neon(int16x8_t *in) { + int32x4_t u[4], t[4]; + int16x4_t s[4], out[4]; + + s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 | + s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 | + s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 | + s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 | + + // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t[0] = vmull_n_s16(s[0], sinpi_1_9); + t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9); + t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t[1] = vmull_n_s16(s[0], sinpi_3_9); + t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9); + t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t[2] = vmull_n_s16(s[0], sinpi_4_9); + t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9); + t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t[3] = vmull_n_s16(s[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u[0] = vaddq_s32(t[0], t[3]); + u[1] = t[1]; + u[2] = vsubq_s32(t[2], t[3]); + u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]); + + // fdct_round_shift + out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS); + out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS); + out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS); + out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = vcombine_s16(out[0], out[1]); + in[1] = vcombine_s16(out[2], out[3]); +} + +void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[2]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); + write_buffer_4x4(output, in); + break; + case DCT_ADST: + load_buffer_4x4(input, in, stride); + // pass1 variant is not accurate enough + vpx_fdct4x4_pass2_neon((int16x4_t *)in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_4x4(input, in, stride); + fadst4x4_neon(in); + fadst4x4_neon(in); + write_buffer_4x4(output, in); + break; + } +} + +static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in, + int stride) { + in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2); + in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2); + in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2); + in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2); + in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2); + in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2); + in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2); + in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void right_shift_8x8(int16x8_t *res, const int bit) { + int16x8_t sign0 = vshrq_n_s16(res[0], 15); + int16x8_t sign1 = vshrq_n_s16(res[1], 15); + int16x8_t sign2 = vshrq_n_s16(res[2], 15); + int16x8_t sign3 = vshrq_n_s16(res[3], 15); + int16x8_t sign4 = vshrq_n_s16(res[4], 15); + int16x8_t sign5 = vshrq_n_s16(res[5], 15); + int16x8_t sign6 = vshrq_n_s16(res[6], 15); + int16x8_t sign7 = vshrq_n_s16(res[7], 15); + + if (bit == 2) { + const int16x8_t const_rounding = vdupq_n_s16(1); + res[0] = vaddq_s16(res[0], const_rounding); + res[1] = vaddq_s16(res[1], const_rounding); + res[2] = vaddq_s16(res[2], const_rounding); + res[3] = vaddq_s16(res[3], const_rounding); + res[4] = vaddq_s16(res[4], const_rounding); + res[5] = vaddq_s16(res[5], const_rounding); + res[6] = vaddq_s16(res[6], const_rounding); + res[7] = vaddq_s16(res[7], const_rounding); + } + + res[0] = vsubq_s16(res[0], sign0); + res[1] = vsubq_s16(res[1], sign1); + res[2] = vsubq_s16(res[2], sign2); + res[3] = vsubq_s16(res[3], sign3); + res[4] = vsubq_s16(res[4], sign4); + res[5] = vsubq_s16(res[5], sign5); + res[6] = vsubq_s16(res[6], sign6); + res[7] = vsubq_s16(res[7], sign7); + + if (bit == 1) { + res[0] = vshrq_n_s16(res[0], 1); + res[1] = vshrq_n_s16(res[1], 1); + res[2] = vshrq_n_s16(res[2], 1); + res[3] = vshrq_n_s16(res[3], 1); + res[4] = vshrq_n_s16(res[4], 1); + res[5] = vshrq_n_s16(res[5], 1); + res[6] = vshrq_n_s16(res[6], 1); + res[7] = vshrq_n_s16(res[7], 1); + } else { + res[0] = vshrq_n_s16(res[0], 2); + res[1] = vshrq_n_s16(res[1], 2); + res[2] = vshrq_n_s16(res[2], 2); + res[3] = vshrq_n_s16(res[3], 2); + res[4] = vshrq_n_s16(res[4], 2); + res[5] = vshrq_n_s16(res[5], 2); + res[6] = vshrq_n_s16(res[6], 2); + res[7] = vshrq_n_s16(res[7], 2); + } +} + +static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res, + int stride) { + store_s16q_to_tran_low(output + 0 * stride, res[0]); + store_s16q_to_tran_low(output + 1 * stride, res[1]); + store_s16q_to_tran_low(output + 2 * stride, res[2]); + store_s16q_to_tran_low(output + 3 * stride, res[3]); + store_s16q_to_tran_low(output + 4 * stride, res[4]); + store_s16q_to_tran_low(output + 5 * stride, res[5]); + store_s16q_to_tran_low(output + 6 * stride, res[6]); + store_s16q_to_tran_low(output + 7 * stride, res[7]); +} + +static INLINE void fadst8x8_neon(int16x8_t *in) { + int16x4_t x_lo[8], x_hi[8]; + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + + x_lo[0] = vget_low_s16(in[7]); + x_hi[0] = vget_high_s16(in[7]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[5]); + x_hi[2] = vget_high_s16(in[5]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[3]); + x_hi[4] = vget_high_s16(in[3]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[1]); + x_hi[6] = vget_high_s16(in[1]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_2_64, cospi_30_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_10_64, cospi_22_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_18_64, cospi_14_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_26_64, cospi_6_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); + + // fdct_round_shift + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6], + &s_lo[7], &s_hi[7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); + // s4 + s6 + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); + // s5 + s7 + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); + // s4 - s6 + t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS); + // s5 - s7 + t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS); + + // stage 3 + // cospi_16_64 * (x2 + x3) + // cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3], + cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + + // cospi_16_64 * (x6 + x7) + // cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); + + // final fdct_round_shift + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + + // x0, x1, x4, x5 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[2] = vcombine_s16(x_lo[6], x_hi[6]); + in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2])); + in[4] = vcombine_s16(x_lo[3], x_hi[3]); + in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7])); + in[6] = vcombine_s16(x_lo[5], x_hi[5]); + in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); + + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); +} + +void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in[8]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride); + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_8x8(input, in, stride); + fadst8x8_neon(in); + fadst8x8_neon(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); + break; + } +} + +static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0, + int16x8_t *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0, + int16x8_t *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +static void fdct16_8col(int16x8_t *in) { + // perform 16x16 1-D DCT for 8 columns + int16x8_t i[8], s1[8], s2[8], s3[8], t[8]; + int16x4_t t_lo[8], t_hi[8]; + int32x4_t u_lo[8], u_hi[8]; + + // stage 1 + i[0] = vaddq_s16(in[0], in[15]); + i[1] = vaddq_s16(in[1], in[14]); + i[2] = vaddq_s16(in[2], in[13]); + i[3] = vaddq_s16(in[3], in[12]); + i[4] = vaddq_s16(in[4], in[11]); + i[5] = vaddq_s16(in[5], in[10]); + i[6] = vaddq_s16(in[6], in[9]); + i[7] = vaddq_s16(in[7], in[8]); + + // pass1 variant is not accurate enough + vpx_fdct8x8_pass2_neon(i); + transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]); + + // step 2 + s1[0] = vsubq_s16(in[7], in[8]); + s1[1] = vsubq_s16(in[6], in[9]); + s1[2] = vsubq_s16(in[5], in[10]); + s1[3] = vsubq_s16(in[4], in[11]); + s1[4] = vsubq_s16(in[3], in[12]); + s1[5] = vsubq_s16(in[2], in[13]); + s1[6] = vsubq_s16(in[1], in[14]); + s1[7] = vsubq_s16(in[0], in[15]); + + t[2] = vsubq_s16(s1[5], s1[2]); + t[3] = vsubq_s16(s1[4], s1[3]); + t[4] = vaddq_s16(s1[4], s1[3]); + t[5] = vaddq_s16(s1[5], s1[2]); + + t_lo[2] = vget_low_s16(t[2]); + t_hi[2] = vget_high_s16(t[2]); + t_lo[3] = vget_low_s16(t[3]); + t_hi[3] = vget_high_s16(t[3]); + t_lo[4] = vget_low_s16(t[4]); + t_hi[4] = vget_high_s16(t[4]); + t_lo[5] = vget_low_s16(t[5]); + t_hi[5] = vget_high_s16(t[5]); + + u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64); + u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64); + u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64); + u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64); + u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64); + u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64); + u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64); + u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64); + + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[3] = vcombine_s16(t_lo[3], t_hi[3]); + s2[4] = vcombine_s16(t_lo[4], t_hi[4]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + + // step 3 + s3[0] = vaddq_s16(s1[0], s2[3]); + s3[1] = vaddq_s16(s1[1], s2[2]); + s3[2] = vsubq_s16(s1[1], s2[2]); + s3[3] = vsubq_s16(s1[0], s2[3]); + s3[4] = vsubq_s16(s1[7], s2[4]); + s3[5] = vsubq_s16(s1[6], s2[5]); + s3[6] = vaddq_s16(s1[6], s2[5]); + s3[7] = vaddq_s16(s1[7], s2[4]); + + // step 4 + t_lo[0] = vget_low_s16(s3[0]); + t_hi[0] = vget_high_s16(s3[0]); + t_lo[1] = vget_low_s16(s3[1]); + t_hi[1] = vget_high_s16(s3[1]); + t_lo[2] = vget_low_s16(s3[2]); + t_hi[2] = vget_high_s16(s3[2]); + t_lo[3] = vget_low_s16(s3[3]); + t_hi[3] = vget_high_s16(s3[3]); + t_lo[4] = vget_low_s16(s3[4]); + t_hi[4] = vget_high_s16(s3[4]); + t_lo[5] = vget_low_s16(s3[5]); + t_hi[5] = vget_high_s16(s3[5]); + t_lo[6] = vget_low_s16(s3[6]); + t_hi[6] = vget_high_s16(s3[6]); + t_lo[7] = vget_low_s16(s3[7]); + t_hi[7] = vget_high_s16(s3[7]); + + // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6] + // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6] + butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6], + -cospi_8_64, cospi_24_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2] + // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2] + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + -cospi_24_64, cospi_8_64, &u_lo[5], + &u_hi[5], &u_lo[2], &u_hi[2]); + + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + + s2[1] = vcombine_s16(t_lo[1], t_hi[1]); + s2[2] = vcombine_s16(t_lo[2], t_hi[2]); + s2[5] = vcombine_s16(t_lo[5], t_hi[5]); + s2[6] = vcombine_s16(t_lo[6], t_hi[6]); + + // step 5 + s1[0] = vaddq_s16(s3[0], s2[1]); + s1[1] = vsubq_s16(s3[0], s2[1]); + s1[2] = vaddq_s16(s3[3], s2[2]); + s1[3] = vsubq_s16(s3[3], s2[2]); + s1[4] = vsubq_s16(s3[4], s2[5]); + s1[5] = vaddq_s16(s3[4], s2[5]); + s1[6] = vsubq_s16(s3[7], s2[6]); + s1[7] = vaddq_s16(s3[7], s2[6]); + + // step 6 + t_lo[0] = vget_low_s16(s1[0]); + t_hi[0] = vget_high_s16(s1[0]); + t_lo[1] = vget_low_s16(s1[1]); + t_hi[1] = vget_high_s16(s1[1]); + t_lo[2] = vget_low_s16(s1[2]); + t_hi[2] = vget_high_s16(s1[2]); + t_lo[3] = vget_low_s16(s1[3]); + t_hi[3] = vget_high_s16(s1[3]); + t_lo[4] = vget_low_s16(s1[4]); + t_hi[4] = vget_high_s16(s1[4]); + t_lo[5] = vget_low_s16(s1[5]); + t_hi[5] = vget_high_s16(s1[5]); + t_lo[6] = vget_low_s16(s1[6]); + t_hi[6] = vget_high_s16(s1[6]); + t_lo[7] = vget_low_s16(s1[7]); + t_hi[7] = vget_high_s16(s1[7]); + + // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0], + cospi_2_64, cospi_30_64, &u_lo[0], + &u_hi[0], &u_lo[7], &u_hi[7]); + + // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1], + cospi_18_64, cospi_14_64, &u_lo[1], + &u_hi[1], &u_lo[6], &u_hi[6]); + + // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2], + cospi_10_64, cospi_22_64, &u_lo[2], + &u_hi[2], &u_lo[5], &u_hi[5]); + + // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3], + cospi_26_64, cospi_6_64, &u_lo[3], + &u_hi[3], &u_lo[4], &u_hi[4]); + + // final fdct_round_shift + t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS); + t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS); + t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS); + t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS); + t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS); + t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS); + t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS); + t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS); + t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS); + + in[0] = i[0]; + in[2] = i[1]; + in[4] = i[2]; + in[6] = i[3]; + in[8] = i[4]; + in[10] = i[5]; + in[12] = i[6]; + in[14] = i[7]; + in[1] = vcombine_s16(t_lo[0], t_hi[0]); + in[3] = vcombine_s16(t_lo[4], t_hi[4]); + in[5] = vcombine_s16(t_lo[2], t_hi[2]); + in[7] = vcombine_s16(t_lo[6], t_hi[6]); + in[9] = vcombine_s16(t_lo[1], t_hi[1]); + in[11] = vcombine_s16(t_lo[5], t_hi[5]); + in[13] = vcombine_s16(t_lo[3], t_hi[3]); + in[15] = vcombine_s16(t_lo[7], t_hi[7]); +} + +static void fadst16_8col(int16x8_t *in) { + // perform 16x16 1-D ADST for 8 columns + int16x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + + x_lo[0] = vget_low_s16(in[15]); + x_hi[0] = vget_high_s16(in[15]); + x_lo[1] = vget_low_s16(in[0]); + x_hi[1] = vget_high_s16(in[0]); + x_lo[2] = vget_low_s16(in[13]); + x_hi[2] = vget_high_s16(in[13]); + x_lo[3] = vget_low_s16(in[2]); + x_hi[3] = vget_high_s16(in[2]); + x_lo[4] = vget_low_s16(in[11]); + x_hi[4] = vget_high_s16(in[11]); + x_lo[5] = vget_low_s16(in[4]); + x_hi[5] = vget_high_s16(in[4]); + x_lo[6] = vget_low_s16(in[9]); + x_hi[6] = vget_high_s16(in[9]); + x_lo[7] = vget_low_s16(in[6]); + x_hi[7] = vget_high_s16(in[6]); + x_lo[8] = vget_low_s16(in[7]); + x_hi[8] = vget_high_s16(in[7]); + x_lo[9] = vget_low_s16(in[8]); + x_hi[9] = vget_high_s16(in[8]); + x_lo[10] = vget_low_s16(in[5]); + x_hi[10] = vget_high_s16(in[5]); + x_lo[11] = vget_low_s16(in[10]); + x_hi[11] = vget_high_s16(in[10]); + x_lo[12] = vget_low_s16(in[3]); + x_hi[12] = vget_high_s16(in[3]); + x_lo[13] = vget_low_s16(in[12]); + x_hi[13] = vget_high_s16(in[12]); + x_lo[14] = vget_low_s16(in[1]); + x_hi[14] = vget_high_s16(in[1]); + x_lo[15] = vget_low_s16(in[14]); + x_hi[15] = vget_high_s16(in[14]); + + // stage 1 + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1], + cospi_1_64, cospi_31_64, &s_lo[0], + &s_hi[0], &s_lo[1], &s_hi[1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3], + cospi_5_64, cospi_27_64, &s_lo[2], + &s_hi[2], &s_lo[3], &s_hi[3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5], + cospi_9_64, cospi_23_64, &s_lo[4], + &s_hi[4], &s_lo[5], &s_hi[5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7], + cospi_13_64, cospi_19_64, &s_lo[6], + &s_hi[6], &s_lo[7], &s_hi[7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9], + cospi_17_64, cospi_15_64, &s_lo[8], + &s_hi[8], &s_lo[9], &s_hi[9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11], + cospi_21_64, cospi_11_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13], + cospi_25_64, cospi_7_64, &s_lo[12], + &s_hi[12], &s_lo[13], &s_hi[13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15], + cospi_29_64, cospi_3_64, &s_lo[14], + &s_hi[14], &s_lo[15], &s_hi[15]); + + // fdct_round_shift + t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9], + cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8], + &s_lo[9], &s_hi[9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11], + cospi_20_64, cospi_12_64, &s_lo[10], + &s_hi[10], &s_lo[11], &s_hi[11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12], + cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13], + &s_lo[12], &s_hi[12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_12_64, cospi_20_64, &s_lo[15], + &s_hi[15], &s_lo[14], &s_hi[14]); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]); + t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]); + t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]); + // s0 - s4 + t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]); + t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]); + // s1 - s7 + t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]); + t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]); + // s8 + s12 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]); + t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]); + t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]); + // s8 + s12 + t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]); + t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]); + // s9 + s13 + t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]); + t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]); + // s10 + s14 + t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]); + // s11 + s15 + t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]); + + t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS); + t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS); + t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS); + t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS); + t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS); + t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS); + t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS); + t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5], + cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4], + &s_lo[5], &s_hi[5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7], + &s_lo[6], &s_hi[6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13], + cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12], + &s_lo[13], &s_hi[13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15], + &s_lo[14], &s_hi[14]); + + // s0 + s4 + t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]); + t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]); + t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]); + // s0 - s4 + t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]); + t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]); + t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]); + // s4 + s6 + t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]); + t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]); + // s5 + s7 + t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]); + t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]); + // s4 - s6 + t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]); + t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]); + // s5 - s7 + t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]); + t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]); + // s8 + s10 + t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]); + t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]); + t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]); + t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]); + t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]); + // s12 + s14 + t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]); + t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]); + // s13 + s15 + t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]); + t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]); + // s12 - s14 + t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]); + t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]); + // s13 - s15 + t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]); + t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]); + + t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS); + t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS); + t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS); + t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS); + t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS); + t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS); + t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS); + t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS); + t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS); + t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS); + t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS); + t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS); + t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS); + t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS); + t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS); + t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS); + + // stage 4 + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3], + &s_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7], + &s_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11], + &s_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15], + &s_hi[15]); + + // final fdct_round_shift + x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS); + x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS); + x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS); + x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS); + x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS); + x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS); + x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS); + x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS); + x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS); + x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS); + x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS); + x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS); + x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS); + x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS); + x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS); + x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS); + + // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly + x_lo[0] = vmovn_s32(t_lo[0]); + x_hi[0] = vmovn_s32(t_hi[0]); + x_lo[1] = vmovn_s32(t_lo[1]); + x_hi[1] = vmovn_s32(t_hi[1]); + x_lo[4] = vmovn_s32(t_lo[4]); + x_hi[4] = vmovn_s32(t_hi[4]); + x_lo[5] = vmovn_s32(t_lo[5]); + x_hi[5] = vmovn_s32(t_hi[5]); + x_lo[8] = vmovn_s32(t_lo[8]); + x_hi[8] = vmovn_s32(t_hi[8]); + x_lo[9] = vmovn_s32(t_lo[9]); + x_hi[9] = vmovn_s32(t_hi[9]); + x_lo[12] = vmovn_s32(t_lo[12]); + x_hi[12] = vmovn_s32(t_hi[12]); + x_lo[13] = vmovn_s32(t_lo[13]); + x_hi[13] = vmovn_s32(t_hi[13]); + + in[0] = vcombine_s16(x_lo[0], x_hi[0]); + in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8])); + in[2] = vcombine_s16(x_lo[12], x_hi[12]); + in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4])); + in[4] = vcombine_s16(x_lo[6], x_hi[6]); + in[5] = vcombine_s16(x_lo[14], x_hi[14]); + in[6] = vcombine_s16(x_lo[10], x_hi[10]); + in[7] = vcombine_s16(x_lo[2], x_hi[2]); + in[8] = vcombine_s16(x_lo[3], x_hi[3]); + in[9] = vcombine_s16(x_lo[11], x_hi[11]); + in[10] = vcombine_s16(x_lo[15], x_hi[15]); + in[11] = vcombine_s16(x_lo[7], x_hi[7]); + in[12] = vcombine_s16(x_lo[5], x_hi[5]); + in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13])); + in[14] = vcombine_s16(x_lo[9], x_hi[9]); + in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1])); +} + +static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) { + // Left half. + fdct16_8col(in0); + // Right half. + fdct16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) { + fadst16_8col(in0); + fadst16_8col(in1); + transpose_s16_16x16(in0, in1); +} + +void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, + int tx_type) { + int16x8_t in0[16], in1[16]; + + switch (tx_type) { + case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fdct16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + default: + assert(tx_type == ADST_ADST); + load_buffer_16x16(input, in0, in1, stride); + fadst16x16_neon(in0, in1); + right_shift_16x16(in0, in1); + fadst16x16_neon(in0, in1); + write_buffer_16x16(output, in0, in1, 16); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_load_buffer_4x4(const int16_t *input, + int32x4_t *in /*[4]*/, int stride) { + // { 0, 1, 1, 1 }; + const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3); + // { 1, 0, 0, 0 }; + const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3); + int32x4_t mask; + + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // Copy the SSE method, use a mask to avoid an 'if' branch here to increase by + // one non-zero first elements + mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a)); + in[0] = vaddq_s32(in[0], mask); + in[0] = vaddq_s32(in[0], nonzero_bias_b); +} + +static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) { + const int32x4_t one = vdupq_n_s32(1); + res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2); + res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2); + res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2); + res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2); + vst1q_s32(output + 0 * 4, res[0]); + vst1q_s32(output + 1 * 4, res[1]); + vst1q_s32(output + 2 * 4, res[2]); + vst1q_s32(output + 3 * 4, res[3]); +} + +static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) { + int32x2_t s_lo[4], s_hi[4]; + int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4]; + + s_lo[0] = vget_low_s32(in[0]); + s_hi[0] = vget_high_s32(in[0]); + s_lo[1] = vget_low_s32(in[1]); + s_hi[1] = vget_high_s32(in[1]); + s_lo[2] = vget_low_s32(in[2]); + s_hi[2] = vget_high_s32(in[2]); + s_lo[3] = vget_low_s32(in[3]); + s_hi[3] = vget_high_s32(in[3]); + + // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9 + t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9); + t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9); + t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9); + t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9); + + // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9 + t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9); + t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9); + t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9); + t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9); + t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9); + t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9); + + // t2 = s0 * sinpi_4_9 - s1* sinpi_1_9 + s3 * sinpi_2_9 + t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9); + t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9); + t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9); + t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9); + t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9); + t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9); + + // t3 = s2 * sinpi_3_9 + t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9); + t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9); + + /* + * u0 = t0 + t3 + * u1 = t1 + * u2 = t2 - t3 + * u3 = t2 - t0 + t3 + */ + u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]); + u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]); + u_lo[1] = t_lo[1]; + u_hi[1] = t_hi[1]; + u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]); + u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]); + u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]); + u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]); + + // fdct_round_shift + in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[0], DCT_CONST_BITS)); + in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[1], DCT_CONST_BITS)); + in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[2], DCT_CONST_BITS)); + in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS), + vrshrn_n_s64(u_hi[3], DCT_CONST_BITS)); + + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); +} + +void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t in[4]; + // int i; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_write_buffer_4x4(output, in); + break; + case DCT_ADST: + highbd_load_buffer_4x4(input, in, stride); + vpx_highbd_fdct4x4_pass1_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_4x4(input, in, stride); + highbd_fadst4x4_neon(in); + highbd_fadst4x4_neon(in); + highbd_write_buffer_4x4(output, in); + break; + } +} + +static INLINE void highbd_load_buffer_8x8(const int16_t *input, + int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/, int stride) { + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. + */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c new file mode 100644 index 0000000000..d631cd437d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" +#include "vpx_mem/vpx_mem.h" + +// Compute the sum of all pixel differences of this MB. +static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { +#if VPX_ARCH_AARCH64 + return vaddlvq_s8(v_sum_diff_total); +#else + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); + const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210); + const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210), + vget_low_s64(fedcba98_76543210)); + const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0); + return sum_diff; +#endif +} + +// Denoise a 16x1 vector. +static INLINE int8x16_t denoiser_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold, + const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment, + const uint8x16_t v_delta_level_1_and_2, + const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) { + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = + vandq_u8(v_level2_mask, v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = + vandq_u8(v_level3_mask, v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = + vaddq_u8(v_level1_adjustment, v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = + vaddq_u8(v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = + vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +static INLINE int8x16_t denoiser_adjust_16x1_neon( + const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y, + const uint8x16_t k_delta, int8x16_t v_sum_diff_total) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = + vandq_u8(v_diff_pos_mask, v_abs_adjustment); + const uint8x16_t v_neg_adjustment = + vandq_u8(v_diff_neg_mask, v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff); + } + return v_sum_diff_total; +} + +// Denoise 8x8 and 8x16 blocks. +static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude, + int width) { + int sum_diff_thresh, r, sum_diff = 0; + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16]; + + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_height = (4 << b_height_log2_lookup[bs]) >> 1; + + int8x16_t v_sum_diff_total = vdupq_n_s8(0); + + for (r = 0; r < b_height; ++r) { + memcpy(sig_buffer[r], sig, width); + memcpy(sig_buffer[r] + width, sig + sig_stride, width); + memcpy(mc_running_buffer[r], mc_running_avg_y, width); + memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride, + width); + memcpy(running_buffer[r], running_avg_y, width); + memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width); + v_sum_diff_total = denoiser_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], + v_level1_threshold, v_level2_threshold, v_level3_threshold, + v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + sig += (sig_stride << 1); + mc_running_avg_y += (mc_avg_y_stride << 1); + running_avg_y += (avg_y_stride << 1); + } + + { + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, that would otherwise not be denoised at all. Simplest + // is to apply an additional adjustment to running_avg_y to bring it + // closer to sig. The adjustment is capped by a maximum delta, and + // chosen such that in most cases the resulting sum_diff will be + // within the acceptable range given by sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + running_avg_y -= avg_y_stride * (b_height << 1); + for (r = 0; r < b_height; ++r) { + v_sum_diff_total = denoiser_adjust_16x1_neon( + sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta, + v_sum_diff_total); + { + const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]); + const uint8x8_t v_running_buffer_high = + vget_high_u8(v_running_buffer); + const uint8x8_t v_running_buffer_low = + vget_low_u8(v_running_buffer); + vst1_u8(running_avg_y, v_running_buffer_low); + vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high); + } + // Update pointers for next iteration. + running_avg_y += (avg_y_stride << 1); + } + sum_diff = horizontal_add_s8x16(v_sum_diff_total); + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + return FILTER_BLOCK; +} + +// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks. +static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_running_avg_y, + int mc_avg_y_stride, uint8_t *running_avg_y, + int avg_y_stride, int increase_denoising, + BLOCK_SIZE bs, int motion_magnitude) { + const int shift_inc = + (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) + ? 1 + : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + const int b_width = (4 << b_width_log2_lookup[bs]); + const int b_height = (4 << b_height_log2_lookup[bs]); + const int b_width_shift4 = b_width >> 4; + + int8x16_t v_sum_diff_total[4][4]; + int r, c, sum_diff = 0; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r] = vdupq_n_s8(0); + } + } + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon( + sig, mc_running_avg_y, running_avg_y, v_level1_threshold, + v_level2_threshold, v_level3_threshold, v_level1_adjustment, + v_delta_level_1_and_2, v_delta_level_2_and_3, + v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + // Update pointers for next iteration. + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + { + const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising); + if (abs(sum_diff) > sum_diff_thresh) { + const int delta = + ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vdupq_n_u8(delta); + sig -= sig_stride * b_height; + mc_running_avg_y -= mc_avg_y_stride * b_height; + running_avg_y -= avg_y_stride * b_height; + sum_diff = 0; + + for (r = 0; r < b_height; ++r) { + for (c = 0; c < b_width_shift4; ++c) { + v_sum_diff_total[c][r >> 4] = + denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y, + k_delta, v_sum_diff_total[c][r >> 4]); + + // Update pointers for next iteration. + sig += 16; + mc_running_avg_y += 16; + running_avg_y += 16; + } + if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) { + for (c = 0; c < b_width_shift4; ++c) { + sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]); + } + } + + sig = sig - b_width + sig_stride; + mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride; + running_avg_y = running_avg_y - b_width + avg_y_stride; + } + + if (abs(sum_diff) > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + return FILTER_BLOCK; +} + +int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, BLOCK_SIZE bs, + int motion_magnitude) { + // Rank by frequency of the block type to have an early termination. + if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 || + bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 || + bs == BLOCK_32X64 || bs == BLOCK_64X32) { + return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude); + } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) { + return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg, + avg_stride, increase_denoising, bs, + motion_magnitude, 8); + } + return COPY_BLOCK; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c new file mode 100644 index 0000000000..b82b3f9db5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx_ports/mem.h" + +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { + int_mv result; + result.as_mv.row = row; + result.as_mv.col = col; + return result; +} + +/***************************************************************************** + * This function utilizes 3 properties of the cost function lookup tables, * + * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * + * vp9_encoder.c. * + * For the joint cost: * + * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * + * For the component costs: * + * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * + * (Equal costs for both components) * + * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * + * (Cost function is even) * + * If these do not hold, then this function cannot be used without * + * modification, in which case you can revert to using the C implementation, * + * which does not rely on these properties. * + *****************************************************************************/ +int vp9_diamond_search_sad_neon(const MACROBLOCK *x, + const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, + const MV *center_mv) { + static const uint32_t data[4] = { 0, 1, 2, 3 }; + const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); + + const int32x4_t zero_s32 = vdupq_n_s32(0); + const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); + const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int)); + const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); + const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int)); + + const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit); + + const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]); + const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]); + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; + const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; + const int tot_steps = cfg->total_steps - search_param; + + const int_mv fcenter_mv = + pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); + const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); + + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; + + int_mv bmv = pack_int_mv(ref_row, ref_col); + int_mv new_bmv = bmv; + int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); + + const int what_stride = x->plane[0].src.stride; + const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; + const uint8_t *const what = x->plane[0].src.buf; + const uint8_t *const in_what = + x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + + // Work out the start point for the search + const uint8_t *best_address = in_what; + const uint8_t *new_best_address = best_address; +#if VPX_ARCH_AARCH64 + int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + // Starting position + unsigned int best_sad = start_mv_sad; + int i, j, step; + + // Check the prerequisite cost function properties that are easy to check + // in an assert. See the function-level documentation for details on all + // prerequisites. + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); + assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); + + *num00 = 0; + + for (i = 0, step = 0; step < tot_steps; step++) { + for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { + int16x8_t v_diff_mv_w; + int8x16_t v_inside_d; + uint32x4_t v_outside_d; + int32x4_t v_cost_d, v_sad_d; +#if VPX_ARCH_AARCH64 + int64x2_t v_blocka[2]; +#else + int32x4_t v_blocka[1]; + uint32x2_t horiz_max_0, horiz_max_1; +#endif + + uint32_t horiz_max; + // Compute the candidate motion vectors + const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]); + const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w); + // Clamp them to the search bounds + int16x8_t v_these_mv_clamp_w = v_these_mv_w; + v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w); + v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w); + // The ones that did not change are inside the search area + v_inside_d = vreinterpretq_s8_u32( + vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w), + vreinterpretq_s32_s16(v_these_mv_w))); + + // If none of them are inside, then move on +#if VPX_ARCH_AARCH64 + horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); +#else + horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), + vget_high_u32(vreinterpretq_u32_s8(v_inside_d))); + horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0); + vst1_lane_u32(&horiz_max, horiz_max_1, 0); +#endif + if (LIKELY(horiz_max == 0)) { + continue; + } + + // The inverse mask indicates which of the MVs are outside + v_outside_d = + vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff))); + // Shift right to keep the sign bit clear, we will use this later + // to set the cost to the maximum value. + v_outside_d = vshrq_n_u32(v_outside_d, 1); + + // Compute the difference MV + v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv); + // We utilise the fact that the cost function is even, and use the + // absolute difference. This allows us to use unsigned indexes later + // and reduces cache pressure somewhat as only a half of the table + // is ever referenced. + v_diff_mv_w = vabsq_s16(v_diff_mv_w); + + // Compute the SIMD pointer offsets. + { +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 + // Load the offsets + int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); + int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); + // Set the ones falling outside to zero + v_bo10_q = vandq_s64( + v_bo10_q, + vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d)))); + v_bo32_q = vandq_s64( + v_bo32_q, + vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d)))); + // Compute the candidate addresses + v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q); + v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q); +#else // sizeof(intptr_t) == 4 + int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]); + v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d)); + v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d); +#endif + } + + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); + + // Look up the component cost of the residual motion vector + { + uint32_t cost[4]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); + vst1q_s16(rowcol, v_diff_mv_w); + + // Note: This is a use case for gather instruction + cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]]; + cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]]; + cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]]; + cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]]; + + v_cost_d = vld1q_s32((int32_t *)cost); + } + + // Now add in the joint cost + { + const uint32x4_t v_sel_d = + vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32); + const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8( + vbslq_u8(vreinterpretq_u8_u32(v_sel_d), + vreinterpretq_u8_s32(v_joint_cost_0_d), + vreinterpretq_u8_s32(v_joint_cost_1_d))); + v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d); + } + + // Multiply by sad_per_bit + v_cost_d = vmulq_s32(v_cost_d, v_spb_d); + // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) + v_cost_d = + vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1))); + v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT); + // Add the cost to the sad + v_sad_d = vaddq_s32(v_sad_d, v_cost_d); + + // Make the motion vectors outside the search area have max cost + // by or'ing in the comparison mask, this way the minimum search won't + // pick them. + v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d)); + + // Find the minimum value and index horizontally in v_sad_d + { + uint32_t local_best_sad; +#if VPX_ARCH_AARCH64 + local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); +#else + uint32x2_t horiz_min_0 = + vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)), + vget_high_u32(vreinterpretq_u32_s32(v_sad_d))); + uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_sad, horiz_min_1, 0); +#endif + + // Update the global minimum if the local minimum is smaller + if (LIKELY(local_best_sad < best_sad)) { +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + uint32_t local_best_idx; + const uint32x4_t v_sel_d = + vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad)); + uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); + v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); + +#if VPX_ARCH_AARCH64 + local_best_idx = vminvq_u32(v_mask_d); +#else + horiz_min_0 = + vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d)); + horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0); + vst1_lane_u32(&local_best_idx, horiz_min_1, 0); +#endif + + new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; + + best_sad = local_best_sad; + } + } + } + + bmv = new_bmv; + best_address = new_best_address; + + v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); +#if VPX_ARCH_AARCH64 + v_ba_q = vdupq_n_s64((intptr_t)best_address); +#else + v_ba_d = vdupq_n_s32((intptr_t)best_address); +#endif + + if (UNLIKELY(best_address == in_what)) { + (*num00)++; + } + } + + *best_mv = bmv.as_mv; + return best_sad; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c index 1c7503139e..0cf0bf250e 100644 --- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c @@ -12,30 +12,91 @@ #include #include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can store 2 15-bit diff before accumulating into 64-bits. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); } diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c new file mode 100644 index 0000000000..78e7361d85 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_sve.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +int64_t vp9_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t err_v = vdupq_n_s64(0); + int64x2_t ssz_v = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err_v = vpx_dotq_s16(err_v, diff0, diff0); + err_v = vpx_dotq_s16(err_v, diff1, diff1); + + ssz_v = vpx_dotq_s16(ssz_v, c0, c0); + ssz_v = vpx_dotq_s16(ssz_v, c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_v); + return horizontal_add_int64x2(err_v); +} + +int64_t vp9_block_error_fp_sve(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int64x2_t err = vdupq_n_s64(0); + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vabdq_s16(c0, d0); + const int16x8_t diff1 = vabdq_s16(c1, d1); + + err = vpx_dotq_s16(err, diff0, diff0); + err = vpx_dotq_s16(err, diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return horizontal_add_int64x2(err); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 0000000000..bc8dd4a341 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. + const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c new file mode 100644 index 0000000000..d9b183472d --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c new file mode 100644 index 0000000000..f990a747ed --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c @@ -0,0 +1,1076 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const uint16x8_t a_reg = vld1q_u16(a); + const uint16x8_t b_reg = vld1q_u16(b); + + uint16x8_t dist = vabdq_u16(a_reg, b_reg); + uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist)); + uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist)); + + vst1q_u32(dst, dist_first); + vst1q_u32(dst + 4, dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) { + uint32x4_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u32(dist); + dist_left = vld1q_u32(dist - 1); + dist_right = vld1q_u32(dist + 1); + + *sum = vaddq_u32(dist_reg, dist_left); + *sum = vaddq_u32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first, + uint32x4_t *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum, + const uint32x4_t *mul_constants, + const int strength, const int rounding, + const int weight) { + const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32); + const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32); + const uint32x4_t weight_u32 = vdupq_n_u32(weight); + const uint32x4_t sixteen = vdupq_n_u32(16); + uint32x4_t sum2; + + // modifier * 3 / index; + uint64x2_t sum_lo = + vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants)); + uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum), + vget_high_u32(*mul_constants)); + + // we cannot use vshrn_n_u64 as strength is not known at compile time. + sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint32x4_t u_reg, v_reg; + uint32x4x2_t pair; + + highbd_read_dist_4(u_dist, &u_reg); + + pair = vzipq_u32(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + highbd_read_dist_4(v_dist, &v_reg); + + pair = vzipq_u32(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +static void highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_first, mul_second; + + uint32x4_t sum_row_1_first, sum_row_1_second; + uint32x4_t sum_row_2_first, sum_row_2_second; + uint32x4_t sum_row_3_first, sum_row_3_second; + + uint32x4_t u_first, u_second; + uint32x4_t v_first, v_second; + + uint32x4_t sum_row_first; + uint32x4_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u32(neighbors_first[1]); + mul_second = vld1q_u32(neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} + +static INLINE uint16x8_t highbd_convolve12_8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16x8_t s8, + const int16x8_t s9, const int16x8_t sA, const int16x8_t sB, + const int16x8_t filter_0_7, const int16x4_t filter_8_11, uint16x8_t max) { + const int16x4_t filter_0_3 = vget_low_s16(filter_0_7); + const int16x4_t filter_4_7 = vget_high_s16(filter_0_7); + + int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), filter_0_3, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), filter_0_3, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), filter_0_3, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), filter_0_3, 3); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s4), filter_4_7, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s5), filter_4_7, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s6), filter_4_7, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s7), filter_4_7, 3); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s8), filter_8_11, 0); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s9), filter_8_11, 1); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(sA), filter_8_11, 2); + sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(sB), filter_8_11, 3); + + int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), filter_0_3, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), filter_0_3, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), filter_0_3, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), filter_0_3, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s4), filter_4_7, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s5), filter_4_7, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s6), filter_4_7, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s7), filter_4_7, 3); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s8), filter_8_11, 0); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s9), filter_8_11, 1); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(sA), filter_8_11, 2); + sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(sB), filter_8_11, 3); + + uint16x4_t sum_lo_s16 = vqrshrun_n_s32(sum_lo, FILTER_BITS); + uint16x4_t sum_hi_s16 = vqrshrun_n_s32(sum_hi, FILTER_BITS); + + uint16x8_t sum = vcombine_u16(sum_lo_s16, sum_hi_s16); + return vminq_u16(sum, max); +} + +void vpx_highbd_convolve12_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16) { + vpx_highbd_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x4_t filter_8_11 = vld1_s16(filter[x0_q4] + 8); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[12], s1[12]; + load_s16_8x12(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7], &s0[8], &s0[9], &s0[10], + &s0[11]); + load_s16_8x12(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7], &s1[8], &s1[9], &s1[10], + &s1[11]); + + uint16x8_t d0 = highbd_convolve12_8( + s0[0], s0[1], s0[2], s0[3], s0[4], s0[5], s0[6], s0[7], s0[8], s0[9], + s0[10], s0[11], filter_0_7, filter_8_11, max); + uint16x8_t d1 = highbd_convolve12_8( + s1[0], s1[1], s1[2], s1[3], s1[4], s1[5], s1[6], s1[7], s1[8], s1[9], + s1[10], s1[11], filter_0_7, filter_8_11, max); + + vst1q_u16(d + 0 * dst_stride, d0); + vst1q_u16(d + 1 * dst_stride, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); +} + +void vpx_highbd_convolve12_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by Neon implementation. + if (y_step_q4 != 16) { + vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]); + const int16x4_t filter_8_11 = vld1_s16(filter[y0_q4] + 8); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + do { + int16x8_t sB, sC, sD, sE; + load_s16_8x4(s, src_stride, &sB, &sC, &sD, &sE); + + uint16x8_t d0 = + highbd_convolve12_8(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, + filter_0_7, filter_8_11, max); + uint16x8_t d1 = + highbd_convolve12_8(s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, + filter_0_7, filter_8_11, max); + uint16x8_t d2 = + highbd_convolve12_8(s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, + filter_0_7, filter_8_11, max); + uint16x8_t d3 = + highbd_convolve12_8(s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, + filter_0_7, filter_8_11, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = sA; + s7 = sB; + s8 = sC; + s9 = sD; + sA = sE; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_highbd_convolve12_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint16_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 2.) + const int im_height = h + MAX_FILTER_TAP; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_highbd_convolve12_horiz_neon( + src - src_stride * border_offset, src_stride, im_block, im_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd); + + vpx_highbd_convolve12_vert_neon(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c new file mode 100644 index 0000000000..73660eb4ac --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, + 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, + 6, 7, 0, 1, 7, 0, 1, 2, + // clang-format on +}; + +static INLINE uint16x8_t highbd_convolve12_8_h( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t filter_0_7, const int16x8_t filter_4_11, + const uint16x8x4_t perm_tbl, const uint16x8_t max) { + int16x8_t perm_samples[8]; + + perm_samples[0] = vpx_tbl_s16(s0, perm_tbl.val[0]); + perm_samples[1] = vpx_tbl_s16(s0, perm_tbl.val[1]); + perm_samples[2] = vpx_tbl2_s16(s0, s1, perm_tbl.val[2]); + perm_samples[3] = vpx_tbl2_s16(s0, s1, perm_tbl.val[3]); + perm_samples[4] = vpx_tbl_s16(s1, perm_tbl.val[0]); + perm_samples[5] = vpx_tbl_s16(s1, perm_tbl.val[1]); + perm_samples[6] = vpx_tbl2_s16(s1, s2, perm_tbl.val[2]); + perm_samples[7] = vpx_tbl2_s16(s1, s2, perm_tbl.val[3]); + + int64x2_t sum01 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[0], filter_0_7, 0); + sum01 = vpx_dotq_lane_s16(sum01, perm_samples[2], filter_0_7, 1); + sum01 = vpx_dotq_lane_s16(sum01, perm_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[1], filter_0_7, 0); + sum23 = vpx_dotq_lane_s16(sum23, perm_samples[3], filter_0_7, 1); + sum23 = vpx_dotq_lane_s16(sum23, perm_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[2], filter_0_7, 0); + sum45 = vpx_dotq_lane_s16(sum45, perm_samples[4], filter_0_7, 1); + sum45 = vpx_dotq_lane_s16(sum45, perm_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + vpx_dotq_lane_s16(vdupq_n_s64(0), perm_samples[3], filter_0_7, 0); + sum67 = vpx_dotq_lane_s16(sum67, perm_samples[5], filter_0_7, 1); + sum67 = vpx_dotq_lane_s16(sum67, perm_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +void vpx_highbd_convolve12_horiz_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (x_step_q4 != 16) { + vpx_highbd_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x8_t filter_4_11 = vld1q_s16(filter[x0_q4] + 4); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdPermuteTbl); + + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + permute_tbl.val[2] = vsetq_lane_u16(svcnth(), permute_tbl.val[2], 7); + permute_tbl.val[3] = vsetq_lane_u16(svcnth(), permute_tbl.val[3], 5); + uint16x8_t permute_tbl_3_offsets = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + permute_tbl.val[3] = + vaddq_u16(permute_tbl.val[3], permute_tbl_3_offsets); // 2, 3, 6, 7 + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[3], s1[3]; + + load_s16_8x3(s + 0 * src_stride, 8, &s0[0], &s0[1], &s0[2]); + load_s16_8x3(s + 1 * src_stride, 8, &s1[0], &s1[1], &s1[2]); + + uint16x8_t d0 = highbd_convolve12_8_h(s0[0], s0[1], s0[2], filter_0_7, + filter_4_11, permute_tbl, max); + uint16x8_t d1 = highbd_convolve12_8_h(s1[0], s1[1], s1[2], filter_0_7, + filter_4_11, permute_tbl, max); + + vst1q_u16(d + 0 * dst_stride, d0); + vst1q_u16(d + 1 * dst_stride, d1); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h != 0); +} + +static INLINE uint16x4_t highbd_convolve12_4_v(const int16x8_t s0[2], + const int16x8_t s1[2], + const int16x8_t s2[2], + const int16x8_t filter_0_7, + const int16x8_t filter_4_11, + const uint16x4_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); + sum01 = vpx_dotq_lane_s16(sum01, s1[0], filter_0_7, 1); + sum01 = vpx_dotq_lane_s16(sum01, s2[0], filter_4_11, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); + sum23 = vpx_dotq_lane_s16(sum23, s1[1], filter_0_7, 1); + sum23 = vpx_dotq_lane_s16(sum23, s2[1], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +void vpx_highbd_convolve12_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (y_step_q4 != 16) { + vpx_highbd_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]); + const int16x8_t filter_4_11 = vld1q_s16(filter[y0_q4] + 4); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); + transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]); + transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]); + transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]); + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_s16_4x4(s8, s9, sA, sB, &s89AB[0], &s89AB[1]); + transpose_concat_s16_4x4(s9, sA, sB, sC, &s9ABC[0], &s9ABC[1]); + transpose_concat_s16_4x4(sA, sB, sC, sD, &sABCD[0], &sABCD[1]); + transpose_concat_s16_4x4(sB, sC, sD, sE, &sBCDE[0], &sBCDE[1]); + + uint16x4_t d0 = highbd_convolve12_4_v(s0123, s4567, s89AB, filter_0_7, + filter_4_11, max); + uint16x4_t d1 = highbd_convolve12_4_v(s1234, s5678, s9ABC, filter_0_7, + filter_4_11, max); + uint16x4_t d2 = highbd_convolve12_4_v(s2345, s6789, sABCD, filter_0_7, + filter_4_11, max); + uint16x4_t d3 = highbd_convolve12_4_v(s3456, s789A, sBCDE, filter_0_7, + filter_4_11, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - reusing as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s8 = sC; + s9 = sD; + sA = sE; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 4; + dst += 4; + w -= 4; + } while (w != 0); +} + +void vpx_highbd_convolve12_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + // Scaling not supported by SVE2 implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint16_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + MAX_FILTER_TAP; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_highbd_convolve12_horiz_sve2( + src - src_stride * border_offset, src_stride, im_block, im_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, im_height, bd); + + vpx_highbd_convolve12_vert_sve2(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 33c2fc7feb..96d0614367 100644 --- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -9,109 +9,395 @@ */ #include - +#include #include +#include +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)scan; +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[0], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } - // now process the rest of the ac coeffs - for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - vst1q_s16(&qcoeff_ptr[i], v_qcoeff); - vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); - } - { - const int16x4_t v_eobmax_3210 = vmax_s16( - vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); +static VPX_FORCE_INLINE void calculate_dqcoeff_and_store( + const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); - } - } else { - memset(qcoeff_ptr, 0, count * sizeof(int16_t)); - memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); - *eob_ptr = 0; - } +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +#endif // CONFIG_VP9_HIGHBITDEPTH } + +static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, + int16x8_t v_eobmax, + uint16x8_t v_nz_mask) { + const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); + return vmaxq_s16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { +#if VPX_ARCH_AARCH64 + return (uint16_t)vmaxvq_s16(v_eobmax); +#else + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + return (uint16_t)vget_lane_s16(v_eobmax_final, 0); +#endif // VPX_ARCH_AARCH64 +} + +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); + *dequant = vld1q_s16(dequant_ptr); +} + +static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, + int16x8_t *v_quant, + int16x8_t *v_dequant) { +#if VPX_ARCH_AARCH64 + *v_round = vdupq_laneq_s16(*v_round, 1); + *v_quant = vdupq_laneq_s16(*v_quant, 1); + *v_dequant = vdupq_laneq_s16(*v_dequant, 1); +#else + *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1); + *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1); + *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + int16x8_t *v_eobmax) { + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + int16x8_t v_eobmax = vdupq_n_s16(-1); + int16x8_t v_round, v_quant, v_dequant; + const int16_t *iscan = scan_order->iscan; + + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); + // process dc and the first seven ac coeffs + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, + dqcoeff_ptr, &v_eobmax); + + // now process the rest of the ac coeffs + update_fp_values(&v_round, &v_quant, &v_dequant); + for (i = 8; i < n_coeffs; i += 8) { + quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, + qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); + } + + *eob_ptr = get_max_eob(v_eobmax); +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void quantize_fp_32x32_8( + const int16x8_t *v_round, const int16x8_t *v_quant, + const int16x8_t *v_dequant, const int16x8_t *dequant_thresh, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) { + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_coeff_abs = vabsq_s16(v_coeff); + const int16x8_t v_thr_mask = + vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh)); + const int16x8_t v_tmp_rnd = + vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask); + const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant); + const int16x8_t v_qcoeff = + vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0)); + + int32x4_t dqcoeff_0, dqcoeff_1; + dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant)); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1)); + vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1)); +#else + store_s16q_to_tran_low(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), + vshrn_n_s32(dqcoeff_1, 1))); +#endif + + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + + *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + int16x8_t eob_max = vdupq_n_s16(-1); + // ROUND_POWER_OF_TWO(round_ptr[], 1) + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); + int16x8_t dequant = vld1q_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + int i; + const int16_t *iscan = scan_order->iscan; + + (void)n_coeffs; + + // Process dc and the first seven ac coeffs. + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); + + update_fp_values(&round, &quant, &dequant); + dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1); + + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + // Process the rest of the ac coeffs. + for (i = 8; i < 32 * 32; i += 8) { + quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, + iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max); + + iscan += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + *eob_ptr = get_max_eob(eob_max); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32); + // const int abs_qcoeff = (int)((tmp * quant) >> 16); + const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + const int16_t *iscan = scan_order->iscan; + + // DC and first 3 AC + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = + highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} + +static VPX_FORCE_INLINE uint16x4_t +highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32, + int32x4_t v_dequant_s32, int32x4_t v_round_s32) { + const int32x4_t v_coeff = vld1q_s32(coeff_ptr); + const int32x4_t v_coeff_sign = + vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0))); + const int32x4_t v_abs_coeff = vabsq_s32(v_coeff); + // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) + const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2); + const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32); + // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0 + const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32), + vreinterpretq_s32_u32(v_mask)); + // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale)); + const int32x4_t v_abs_qcoeff = + vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32); + // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_qcoeff = + vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign); + // vshlq_s32 will shift right if shift value is negative. + const int32x4_t v_abs_dqcoeff = + vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1); + // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + const int32x4_t v_dqcoeff = + vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign); + + vst1q_s32(qcoeff_ptr, v_qcoeff); + vst1q_s32(dqcoeff_ptr, v_dqcoeff); + + // Packed nz_qcoeff_mask. Used to find eob. + return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0))); +} + +void vp9_highbd_quantize_fp_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); + const int16x4_t v_dequant = vld1_s16(dequant_ptr); + const int16x4_t v_zero = vdup_n_s16(0); + const int16x4_t v_round = + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); + int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); + int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); + int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); + uint16x4_t v_mask_lo, v_mask_hi; + int16x8_t v_eobmax = vdupq_n_s16(-1); + const int16_t *iscan = scan_order->iscan; + + // DC and first 3 AC + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + + // overwrite the DC constants with AC constants + v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1); + v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1); + v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1); + + // 4 more AC + v_mask_hi = + highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4, + v_quant_s32, v_dequant_s32, v_round_s32); + + // Find the max lane eob for the first 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan += 8; + v_mask_lo = + highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, + v_quant_s32, v_dequant_s32, v_round_s32); + v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, + dqcoeff_ptr + 4, v_quant_s32, + v_dequant_s32, v_round_s32); + // Find the max lane eob for 8 coeffs. + v_eobmax = + get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi)); + n_coeffs -= 8; + } while (n_coeffs); + + *eob_ptr = get_max_eob(v_eobmax); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 0000000000..b6cce39bfe --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,1103 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x8_t a_reg = vld1_u8(a); + const uint8x8_t b_reg = vld1_u8(b); + + uint16x8_t dist_first = vabdl_u8(a_reg, b_reg); + dist_first = vmulq_u16(dist_first, dist_first); + + vst1q_u16(dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const uint8x16_t a_reg = vld1q_u8(a); + const uint8x16_t b_reg = vld1q_u8(b); + + uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg)); + uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg)); + dist_first = vmulq_u16(dist_first, dist_first); + dist_second = vmulq_u16(dist_second, dist_second); + + vst1q_u16(dst, dist_first); + vst1q_u16(dst + 8, dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) { + *dist_reg = vld1q_u16(dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first, + uint16x8_t *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE uint16x8_t average_8(uint16x8_t sum, + const uint16x8_t *mul_constants, + const int strength, const int rounding, + const uint16x8_t *weight) { + const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16); + const uint16x8_t weight_u16 = *weight; + const uint16x8_t sixteen = vdupq_n_u16(16); + const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16); + + // modifier * 3 / index; + uint32x4_t sum_hi = + vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants)); + uint32x4_t sum_lo = + vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants)); + + sum_lo = vqaddq_u32(sum_lo, rounding_u32); + sum_hi = vqaddq_u32(sum_hi, rounding_u32); + + // we cannot use vshrn_n_u32 as strength is not known at compile time. + sum_lo = vshlq_u32(sum_lo, strength_u32); + sum_hi = vshlq_u32(sum_hi, strength_u32); + + sum = vcombine_u16(vmovn_u32(sum_hi), vmovn_u32(sum_lo)); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = vminq_u16(sum, sixteen); + sum = vsubq_u16(sixteen, sum); + return vmulq_u16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const uint16x8_t sum_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred)); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16)); + accum_1_u32 = + vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16, + const uint16x8_t sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + uint8x16_t pred_u8 = vld1q_u8(pred); + uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8)); + uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8)); + uint16x8_t count_0_u16 = vld1q_u16(count); + uint16x8_t count_1_u16 = vld1q_u16(count + 8); + uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16); + vst1q_u16(count, count_0_u16); + count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16); + vst1q_u16(count + 8, count_1_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + accum_2_u32 = vld1q_u32(accumulator + 8); + accum_3_u32 = vld1q_u32(accumulator + 12); + + accum_0_u32 = + vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16)); + accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16), + vget_high_u16(pred_0_u16)); + accum_2_u32 = + vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16)); + accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16), + vget_high_u16(pred_1_u16)); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); + vst1q_u32(accumulator + 8, accum_2_u32); + vst1q_u32(accumulator + 12, accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) { + uint16x8_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u16(y_dist); + dist_left = vld1q_u16(y_dist - 1); + dist_right = vld1q_u16(y_dist + 1); + + *sum = vqaddq_u16(dist_reg, dist_left); + *sum = vqaddq_u16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first, + uint16x8_t *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + uint16x8_t *u_first, + uint16x8_t *u_second, + uint16x8_t *v_first, + uint16x8_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} + +static INLINE uint8x8_t convolve12_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t s8, const int16x8_t s9, + const int16x8_t sA, const int16x8_t sB, + const int16x8_t filter_0_7, + const int16x4_t filter_8_11) { + const int16x4_t filter_0_3 = vget_low_s16(filter_0_7); + const int16x4_t filter_4_7 = vget_high_s16(filter_0_7); + + int16x8_t sum = vmulq_lane_s16(s0, filter_0_3, 0); + sum = vmlaq_lane_s16(sum, s1, filter_0_3, 1); + sum = vmlaq_lane_s16(sum, s2, filter_0_3, 2); + sum = vmlaq_lane_s16(sum, s3, filter_0_3, 3); + sum = vmlaq_lane_s16(sum, s4, filter_4_7, 0); + + sum = vmlaq_lane_s16(sum, s7, filter_4_7, 3); + sum = vmlaq_lane_s16(sum, s8, filter_8_11, 0); + sum = vmlaq_lane_s16(sum, s9, filter_8_11, 1); + sum = vmlaq_lane_s16(sum, sA, filter_8_11, 2); + sum = vmlaq_lane_s16(sum, sB, filter_8_11, 3); + + // Saturating addition is required for the largest filter taps to avoid + // overflow (while staying in 16-bit elements.) + sum = vqaddq_s16(sum, vmulq_lane_s16(s5, filter_4_7, 1)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s6, filter_4_7, 2)); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void vpx_convolve12_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16) { + vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 0); + + const int16x8_t filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x4_t filter_8_11 = vld1_s16(filter[x0_q4] + 8); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x8_t s0s1 = vcombine_s16(s0, s1); + int16x8_t s1s2 = vcombine_s16(s1, s2); + int16x8_t s2s3 = vcombine_s16(s2, s3); + int16x8_t s3s4 = vcombine_s16(s3, s4); + int16x8_t s4s5 = vcombine_s16(s4, s5); + int16x8_t s5s6 = vcombine_s16(s5, s6); + int16x8_t s6s7 = vcombine_s16(s6, s7); + + load_u8_8x4(s + 8, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t sA = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + int16x8_t s7s8 = vcombine_s16(s7, s8); + int16x8_t s8s9 = vcombine_s16(s8, s9); + int16x8_t s9sA = vcombine_s16(s9, sA); + + s += 11; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t sB = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t sC = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t sD = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t sE = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x8_t sAsB = vcombine_s16(sA, sB); + int16x8_t sBsC = vcombine_s16(sB, sC); + int16x8_t sCsD = vcombine_s16(sC, sD); + int16x8_t sDsE = vcombine_s16(sD, sE); + + uint8x8_t d01 = + convolve12_8(s0s1, s1s2, s2s3, s3s4, s4s5, s5s6, s6s7, s7s8, s8s9, + s9sA, sAsB, sBsC, filter_0_7, filter_8_11); + uint8x8_t d23 = + convolve12_8(s2s3, s3s4, s4s5, s5s6, s6s7, s7s8, s8s9, s9sA, sAsB, + sBsC, sCsD, sDsE, filter_0_7, filter_8_11); + + transpose_u8_4x4(&d01, &d23); + + store_u8(d + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(d + 1 * dst_stride, 2 * dst_stride, d23); + + s0s1 = s4s5; + s1s2 = s5s6; + s2s3 = s6s7; + s3s4 = s7s8; + s4s5 = s8s9; + s5s6 = s9sA; + s6s7 = sAsB; + s7s8 = sBsC; + s8s9 = sCsD; + s9sA = sDsE; + sA = sE; + s += 4; + d += 4; + width -= 4; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} + +void vpx_convolve12_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Scaling not supported by Neon implementation. + if (y_step_q4 != 16) { + vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + const int16x8_t filter_0_7 = vld1q_s16(filter[y0_q4]); + const int16x4_t filter_8_11 = vld1_s16(filter[y0_q4] + 8); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t sA = vreinterpretq_s16_u16(vmovl_u8(tA)); + + s += 11 * src_stride; + + do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); + + int16x8_t sB = vreinterpretq_s16_u16(vmovl_u8(tB)); + int16x8_t sC = vreinterpretq_s16_u16(vmovl_u8(tC)); + int16x8_t sD = vreinterpretq_s16_u16(vmovl_u8(tD)); + int16x8_t sE = vreinterpretq_s16_u16(vmovl_u8(tE)); + + uint8x8_t d0 = convolve12_8(s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, + sB, filter_0_7, filter_8_11); + uint8x8_t d1 = convolve12_8(s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, + sC, filter_0_7, filter_8_11); + uint8x8_t d2 = convolve12_8(s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, + sD, filter_0_7, filter_8_11); + uint8x8_t d3 = convolve12_8(s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, + sE, filter_0_7, filter_8_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = sA; + s7 = sB; + s8 = sC; + s9 = sD; + sA = sE; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_convolve12_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel12 *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + MAX_FILTER_TAP; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_convolve12_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, im_height); + + vpx_convolve12_vert_neon(im_block + im_stride * border_offset, im_stride, dst, + dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c new file mode 100644 index 0000000000..1fa2d8732f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + // clang-format on +}; + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // clang-format off + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 + // clang-format on +}; + +static INLINE uint8x8_t convolve12_8_h(uint8x16_t samples[2], + const int8x16_t filter, + const uint8x16x3_t perm_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128[2] = { + vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))), + vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128))) + }; + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + // { 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 } + int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], perm_tbl.val[0]), + vqtbl1q_s8(samples_128[0], perm_tbl.val[1]), + vqtbl1q_s8(samples_128[0], perm_tbl.val[2]), + vqtbl1q_s8(samples_128[1], perm_tbl.val[2]) }; + + // Accumulate into 128 << FILTER_BITS to account for range transform. + int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1); + sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2); + + int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1); + sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2); + + // Narrow and re-pack. + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +void vpx_convolve12_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16) { + vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + const int16x8_t x_filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x4_t x_filter_8_11 = vld1_s16(filter[x0_q4] + 8); + const int16x8_t x_filter_8_15 = vcombine_s16(x_filter_8_11, vdup_n_s16(0)); + const int8x16_t x_filter = + vcombine_s8(vmovn_s16(x_filter_0_7), vmovn_s16(x_filter_8_15)); + + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} + +static INLINE uint8x8_t convolve12_8_v( + const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo, + const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + // Accumulate into 128 << FILTER_BITS to account for range transform. + int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS); + + int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0); + sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0); + sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void vpx_convolve12_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Scaling not supported by Neon implementation. + if (y_step_q4 != 16) { + vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(filter[y0_q4] + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA; + load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, + &t9, &tA); + s += 11 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_s8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_s8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_s8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_s8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + + do { + uint8x8_t tB, tC, tD, tE; + load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE); + + int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128))); + int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128))); + int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128))); + int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128))); + + int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_s8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]); + + int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_v(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_v(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_v(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_v(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void vpx_convolve12_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w, + int h) { + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 3); + + const int16x8_t x_filter_0_7 = vld1q_s16(filter[x0_q4]); + const int16x4_t x_filter_8_11 = vld1_s16(filter[x0_q4] + 8); + const int16x8_t x_filter_8_15 = vcombine_s16(x_filter_8_11, vdup_n_s16(0)); + const int8x16_t x_filter = + vcombine_s8(vmovn_s16(x_filter_0_7), vmovn_s16(x_filter_8_15)); + + const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 3); + + do { + uint8x16_t s0[2], s1[2], s2[2]; + load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]); + load_u8_16x3(src + 4, src_stride, &s0[1], &s1[1], &s2[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + + store_u8_8x3(dst, dst_stride, d0, d1, d2); + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_convolve12_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. + const int im_height = h + MAX_FILTER_TAP - 1; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_convolve12_2d_horiz_neon_dotprod(src - src_stride * border_offset, + src_stride, im_block, im_stride, filter, + x0_q4, w, im_height); + + vpx_convolve12_vert_neon_dotprod(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c new file mode 100644 index 0000000000..3803873a44 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, kMatMulPermuteTbl[32]) = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9, + 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + // clang-format on +}; + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // clang-format off + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 + // clang-format on +}; + +static INLINE uint8x8_t convolve12_8_h(uint8x16_t samples[2], + const int8x16_t filter[2], + const uint8x16x2_t perm_tbl) { + // Permute samples ready for matrix multiply. + // { 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 } + // { 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 } + // { 6, 7, 8, 9, 10, 11, 12, 13, 8, 9, 10, 11, 12, 13, 14, 15 } + // { 10, 11, 12, 13, 14, 15, 16, 17, 12, 13, 14, 15, 16, 17, 18, 19 } + uint8x16_t perm_samples[4] = { vqtbl1q_u8(samples[0], perm_tbl.val[0]), + vqtbl1q_u8(samples[0], perm_tbl.val[1]), + vqtbl1q_u8(samples[1], perm_tbl.val[0]), + vqtbl1q_u8(samples[1], perm_tbl.val[1]) }; + + // These instructions multiply a 2x8 matrix (samples) by an 8x2 matrix + // (filter), destructively accumulating into the destination register. + int32x4_t sum0123 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[0], filter[0]); + int32x4_t sum4567 = vusmmlaq_s32(vdupq_n_s32(0), perm_samples[1], filter[0]); + sum0123 = vusmmlaq_s32(sum0123, perm_samples[2], filter[1]); + sum4567 = vusmmlaq_s32(sum4567, perm_samples[3], filter[1]); + + // Narrow and re-pack. + int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS), + vqrshrn_n_s32(sum4567, FILTER_BITS)); + return vqmovun_s16(sum_s16); +} + +void vpx_convolve12_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16) { + vpx_convolve12_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + // Split 12-tap filter into two 6-tap filters, masking the top two elements. + // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } + const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); + const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask); + const int8x8_t filter_1 = + vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2); + + // Stagger each 6-tap filter to enable use of matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter[2] = { + vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), + vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) + }; + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); +} + +static INLINE uint8x8_t convolve12_8_v( + const uint8x16_t s0_lo, const uint8x16_t s0_hi, const uint8x16_t s1_lo, + const uint8x16_t s1_hi, const uint8x16_t s2_lo, const uint8x16_t s2_hi, + const int8x8_t filters_0_7, const int8x8_t filters_4_11) { + // The sample range transform and permutation are performed by the caller. + int32x4_t sum0123 = vusdotq_lane_s32(vdupq_n_s32(0), s0_lo, filters_0_7, 0); + sum0123 = vusdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1); + sum0123 = vusdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1); + + int32x4_t sum4567 = vusdotq_lane_s32(vdupq_n_s32(0), s0_hi, filters_0_7, 0); + sum4567 = vusdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1); + sum4567 = vusdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +void vpx_convolve12_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Scaling not supported by Neon implementation. + if (y_step_q4 != 16) { + vpx_convolve12_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(filter[y0_q4] + 4)); + + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl); + + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + do { + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s789A_lo, s789A_hi; + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + transpose_concat_u8_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi); + transpose_concat_u8_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi); + transpose_concat_u8_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi); + transpose_concat_u8_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi); + + do { + uint8x8_t sB, sC, sD, sE; + load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE); + + uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi, + sBCDE_lo, sBCDE_hi; + transpose_concat_u8_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } }; + s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]); + s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]); + sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]); + + uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } }; + s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]); + s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]); + sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve12_8_v(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo, + s89AB_hi, filter_0_7, filter_4_11); + uint8x8_t d1 = + convolve12_8_v(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo, + s9ABC_hi, filter_0_7, filter_4_11); + uint8x8_t d2 = + convolve12_8_v(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo, + sABCD_hi, filter_0_7, filter_4_11); + uint8x8_t d3 = + convolve12_8_v(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo, + sBCDE_hi, filter_0_7, filter_4_11); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s789A_lo; + s3456_hi = s789A_hi; + s4567_lo = s89AB_lo; + s4567_hi = s89AB_hi; + s5678_lo = s9ABC_lo; + s5678_hi = s9ABC_hi; + s6789_lo = sABCD_lo; + s6789_hi = sABCD_hi; + s789A_lo = sBCDE_lo; + s789A_hi = sBCDE_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void vpx_convolve12_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel12 *filter, int x0_q4, int w, + int h) { + assert(w == 32 || w == 16 || w == 8); + assert(h % 4 == 3); + + // Split 12-tap filter into two 6-tap filters, masking the top two elements. + // { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 } + const int8x8_t mask = vcreate_s8(0x0000ffffffffffff); + const int8x8_t filter_0 = vand_s8(vmovn_s16(vld1q_s16(filter[x0_q4])), mask); + const int8x8_t filter_1 = + vext_s8(vmovn_s16(vld1q_s16(filter[x0_q4] + 4)), vdup_n_s8(0), 2); + + // Stagger each 6-tap filter to enable use of matrix multiply instructions. + // { f0, f1, f2, f3, f4, f5, 0, 0, 0, f0, f1, f2, f3, f4, f5, 0 } + const int8x16_t x_filter[2] = { + vcombine_s8(filter_0, vext_s8(filter_0, filter_0, 7)), + vcombine_s8(filter_1, vext_s8(filter_1, filter_1, 7)) + }; + + const uint8x16x2_t permute_tbl = vld1q_u8_x2(kMatMulPermuteTbl); + + src -= MAX_FILTER_TAP / 2 - 1; + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0[2], s1[2], s2[2], s3[2]; + load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]); + load_u8_16x4(s + 6, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + uint8x8_t d3 = convolve12_8_h(s3, x_filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 3); + + do { + uint8x16_t s0[2], s1[2], s2[2]; + load_u8_16x3(src, src_stride, &s0[0], &s1[0], &s2[0]); + load_u8_16x3(src + 6, src_stride, &s0[1], &s1[1], &s2[1]); + + uint8x8_t d0 = convolve12_8_h(s0, x_filter, permute_tbl); + uint8x8_t d1 = convolve12_8_h(s1, x_filter, permute_tbl); + uint8x8_t d2 = convolve12_8_h(s2, x_filter, permute_tbl); + + store_u8_8x3(dst, dst_stride, d0, d1, d2); + + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_convolve12_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Scaling not supported by Neon implementation. + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_convolve12_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + return; + } + + assert(w == 32 || w == 16 || w == 8); + assert(h == 32 || h == 16 || h == 8); + + DECLARE_ALIGNED(32, uint8_t, im_block[BW * (BH + MAX_FILTER_TAP)]); + + const int im_stride = BW; + // Account for the vertical pass needing MAX_FILTER_TAP / 2 - 1 lines prior + // and MAX_FILTER_TAP / 2 lines post. + const int im_height = h + MAX_FILTER_TAP - 1; + const ptrdiff_t border_offset = MAX_FILTER_TAP / 2 - 1; + + // Filter starting border_offset rows up. + vpx_convolve12_2d_horiz_neon_i8mm(src - src_stride * border_offset, + src_stride, im_block, im_stride, filter, + x0_q4, w, im_height); + + vpx_convolve12_vert_neon_i8mm(im_block + im_stride * border_offset, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c index 188d04d8f6..61786d8f66 100644 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -79,6 +80,7 @@ return err; \ } +#if !CONFIG_VP9_HIGHBITDEPTH BLOCK_ERROR_BLOCKSIZE_MSA(16); BLOCK_ERROR_BLOCKSIZE_MSA(64); BLOCK_ERROR_BLOCKSIZE_MSA(256); @@ -103,3 +105,4 @@ int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr, return err; } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c index 0831e59148..efbbe830db 100644 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c index fa36f09ab8..9c5cc12ef0 100644 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c index 604db853c4..26d81aa9ef 100644 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -10,6 +10,7 @@ #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h index 794bec70b6..fa1af2fc57 100644 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ -#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ #include "vpx_dsp/mips/fwd_txfm_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -113,4 +113,4 @@ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ out0, out1, out2, out3); \ } -#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ +#endif // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c deleted file mode 100644 index 23f7ebace4..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, int32_t filt_sth, - int32_t filt_wgt, uint32_t *acc, - uint16_t *cnt) { - uint32_t row; - uint64_t f0, f1, f2, f3; - v16i8 frm2, frm1 = { 0 }; - v16i8 frm4, frm3 = { 0 }; - v16u8 frm_r, frm_l; - v8i16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 2; row--;) { - LD4(frm1_ptr, stride, f0, f1, f2, f3); - frm1_ptr += (4 * stride); - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 32; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - INSERT_D2_SB(f0, f1, frm1); - INSERT_D2_SB(f2, f3, frm3); - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - UNPCK_UB_SH(frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - UNPCK_UB_SH(frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - } -} - -static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, - uint8_t *frm2_ptr, - int32_t filt_sth, int32_t filt_wgt, - uint32_t *acc, uint16_t *cnt) { - uint32_t row; - v16i8 frm1, frm2, frm3, frm4; - v16u8 frm_r, frm_l; - v16i8 zero = { 0 }; - v8u16 frm2_r, frm2_l; - v8i16 diff0, diff1, mod0_h, mod1_h; - v4i32 cnst3, cnst16, filt_wt, strength; - v4i32 mod0_w, mod1_w, mod2_w, mod3_w; - v4i32 diff0_r, diff0_l, diff1_r, diff1_l; - v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; - v4i32 acc0, acc1, acc2, acc3; - v8i16 cnt0, cnt1; - - filt_wt = __msa_fill_w(filt_wgt); - strength = __msa_fill_w(filt_sth); - cnst3 = __msa_ldi_w(3); - cnst16 = __msa_ldi_w(16); - - for (row = 8; row--;) { - LD_SB2(frm1_ptr, stride, frm1, frm3); - frm1_ptr += stride; - - LD_SB2(frm2_ptr, 16, frm2, frm4); - frm2_ptr += 16; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - LD_SW2(acc, 4, acc0, acc1); - LD_SW2(acc + 8, 4, acc2, acc3); - LD_SH2(cnt, 8, cnt0, cnt1); - - ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); - HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); - UNPCK_SH_SW(diff0, diff0_r, diff0_l); - UNPCK_SH_SW(diff1, diff1_r, diff1_l); - MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, - mod0_w, mod1_w, mod2_w, mod3_w); - MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, - mod1_w, mod2_w, mod3_w); - SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); - - diff0_r = (mod0_w < cnst16); - diff0_l = (mod1_w < cnst16); - diff1_r = (mod2_w < cnst16); - diff1_l = (mod3_w < cnst16); - - SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, - mod1_w, mod2_w, mod3_w); - - mod0_w = diff0_r & mod0_w; - mod1_w = diff0_l & mod1_w; - mod2_w = diff1_r & mod2_w; - mod3_w = diff1_l & mod3_w; - - MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, - mod0_w, mod1_w, mod2_w, mod3_w); - PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); - ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); - ST_SH2(mod0_h, mod1_h, cnt, 8); - cnt += 16; - - ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); - UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); - UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); - MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, - mod0_w, mod1_w, mod2_w, mod3_w); - ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, - mod2_w, mod3_w); - ST_SW2(mod0_w, mod1_w, acc, 4); - acc += 8; - ST_SW2(mod2_w, mod3_w, acc, 4); - acc += 8; - - frm1_ptr += stride; - frm2_ptr += 16; - } -} - -void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, - uint8_t *frame2_ptr, uint32_t blk_w, - uint32_t blk_h, int32_t strength, - int32_t filt_wgt, uint32_t *accu, - uint16_t *cnt) { - if (8 == (blk_w * blk_h)) { - temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else if (16 == (blk_w * blk_h)) { - temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, - filt_wgt, accu, cnt); - } else { - vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, - strength, filt_wgt, accu, cnt); - } -} diff --git a/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c new file mode 100644 index 0000000000..4d31558471 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +#include "./vp9_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 +// Note: Because this is done in 2 operations, a and b cannot both be UINT16_MIN +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + + (void)scan; + + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from round and quant + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + + // Remove DC value from dequant + dequant = vec_splat(dequant, 1); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per + // loop iteration. + // for 8x8: 16 + 2 x 24 = 64 + // for 16x16: 16 + 10 x 24 = 256 + if (n_coeffs > 16) { + int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2; + + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + + do { + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + + qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + qcoeff2 = vec_sign(qcoeff2, coeff2); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} + +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); + int16x8_t abs_coeff0 = vec_abs(coeff0); + int16x8_t abs_coeff1 = vec_abs(coeff1); + + (void)scan; + (void)n_coeffs; + + mask0 = vec_cmpge(abs_coeff0, thres); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff0 = vec_and(qcoeff0, mask0); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from thres, round, quant and dequant + thres = vec_splat(thres, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + + mask1 = vec_cmpge(abs_coeff1, thres); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = + vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16); + qcoeff1 = vec_and(qcoeff1, mask1); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + do { + int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2, mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + abs_coeff0 = vec_abs(coeff0); + abs_coeff1 = vec_abs(coeff1); + abs_coeff2 = vec_abs(coeff2); + + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16); + qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16); + + mask0 = vec_cmpge(abs_coeff0, thres); + mask1 = vec_cmpge(abs_coeff1, thres); + mask2 = vec_cmpge(abs_coeff2, thres); + + qcoeff0 = vec_and(qcoeff0, mask0); + qcoeff1 = vec_and(qcoeff1, mask1); + qcoeff2 = vec_and(qcoeff2, mask2); + + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + + qcoeff0 = vec_sign(qcoeff0, coeff0); + qcoeff1 = vec_sign(qcoeff1, coeff1); + qcoeff2 = vec_sign(qcoeff2, coeff2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb5845..acc3764c7a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85b..22a657e035 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -15,8 +15,8 @@ * for altref frames. Go to alt_ref_aq_private.h for implmentation details. */ -#ifndef VP9_ENCODER_VP9_ALT_REF_AQ_H_ -#define VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ #include "vpx/vpx_integer.h" @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * @@ -124,4 +124,4 @@ void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#endif // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h index b1b56561d8..749d3c198a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_360_H_ -#define VP9_ENCODER_VP9_AQ_360_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_ +#define VPX_VP9_ENCODER_VP9_AQ_360_H_ #include "vp9/encoder/vp9_encoder.h" @@ -24,4 +24,4 @@ void vp9_360aq_frame_setup(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_360_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c index bd3812036c..ef3423f8eb 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c @@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { &cpi->rc, cm->frame_type, cm->base_qindex, aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); - // For AQ complexity mode, we dont allow Q0 in a segment if the base + // For AQ complexity mode, we don't allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h index a00d34e702..d3cb34c013 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ -#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 3dc88b1914..92a31ebf63 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -21,6 +21,14 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); @@ -39,13 +47,17 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } assert(MAXQ <= 255); memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->counter_encode_maxq_scene_change = 0; + cr->content_mode = 1; return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - vpx_free(cr->map); - vpx_free(cr->last_coded_q_map); - vpx_free(cr); + if (cr != NULL) { + vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); + vpx_free(cr); + } } // Check if this coding block, of size bsize, should be considered for refresh @@ -102,18 +114,18 @@ int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi, double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl; double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl; // Take segment weighted average for estimated bits. - estimated_bits = - (int)((1.0 - weight_segment1 - weight_segment2) * - vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, - correction_factor, cm->bit_depth) + - weight_segment1 * - vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta[1], - mbs, correction_factor, cm->bit_depth) + - weight_segment2 * - vp9_estimate_bits_at_q(cm->frame_type, - cm->base_qindex + cr->qindex_delta[2], - mbs, correction_factor, cm->bit_depth)); + estimated_bits = (int)round( + (1.0 - weight_segment1 - weight_segment2) * + vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs, + correction_factor, cm->bit_depth) + + weight_segment1 * + vp9_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[1], mbs, + correction_factor, cm->bit_depth) + + weight_segment2 * + vp9_estimate_bits_at_q(cm->frame_type, + cm->base_qindex + cr->qindex_delta[2], mbs, + correction_factor, cm->bit_depth)); return estimated_bits; } @@ -127,28 +139,19 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; - int num8x8bl = cm->MBs << 2; - // Compute delta-q corresponding to qindex i. - int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); - // Weight for segment prior to encoding: take the average of the target - // number for the frame to be encoded and the actual from the previous frame. - // Use the target if its less. - int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; - double weight_segment_target = (double)(target_refresh) / num8x8bl; - double weight_segment = - (double)((target_refresh + cr->actual_num_seg1_blocks + - cr->actual_num_seg2_blocks) >> - 1) / - num8x8bl; - if (weight_segment_target < 7 * weight_segment / 8) - weight_segment = weight_segment_target; + int deltaq = 0; + if (cpi->oxcf.speed < 8) + deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + else + deltaq = -(cr->max_qdelta_perc * i) / 200; // Take segment weighted average for bits per mb. - bits_per_mb = (int)((1.0 - weight_segment) * - vp9_rc_bits_per_mb(cm->frame_type, i, - correction_factor, cm->bit_depth) + - weight_segment * - vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, - correction_factor, cm->bit_depth)); + bits_per_mb = + (int)round((1.0 - cr->weight_segment) * + vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, + cm->bit_depth) + + cr->weight_segment * + vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, + correction_factor, cm->bit_depth)); return bits_per_mb; } @@ -186,7 +189,8 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, MODE_INFO *const mi, // If this block is labeled for refresh, check if we should reset the // segment_id. - if (cyclic_refresh_segment_id_boosted(mi->segment_id)) { + if (cpi->sf.use_nonrd_pick_mode && + cyclic_refresh_segment_id_boosted(mi->segment_id)) { mi->segment_id = refresh_this_block; // Reset segment_id if it will be skipped. if (skip) mi->segment_id = CR_SEGMENT_ID_BASE; @@ -250,24 +254,66 @@ void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi, } } -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi = cm->mi_grid_visible; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + RATE_CONTROL *const rc = &cpi->rc; unsigned char *const seg_map = cpi->segmentation_map; + double fraction_low = 0.0; + int force_gf_refresh = 0; + int low_content_frame = 0; int mi_row, mi_col; cr->actual_num_seg1_blocks = 0; cr->actual_num_seg2_blocks = 0; - for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { - if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) == - CR_SEGMENT_ID_BOOST1) + MV mv = mi[0]->mv[0].as_mv; + int map_index = mi_row * cm->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1) cr->actual_num_seg1_blocks++; - else if (cyclic_refresh_segment_id( - seg_map[mi_row * cm->mi_cols + mi_col]) == + else if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST2) cr->actual_num_seg2_blocks++; + // Accumulate low_content_frame. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + low_content_frame++; + mi++; } + mi += 8; + } + // Check for golden frame update: only for non-SVC and non-golden boost. + if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && + !cpi->oxcf.gf_cbr_boost_pct) { + // Force this frame as a golden update frame if this frame changes the + // resolution (resize_pending != 0). + if (cpi->resize_pending != 0) { + vp9_cyclic_refresh_set_golden_update(cpi); + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + force_gf_refresh = 1; + } + // Update average of low content/motion in the frame. + fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols); + cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; + if (!force_gf_refresh && cpi->refresh_golden_frame == 1 && + rc->frames_since_key > rc->frames_since_golden + 1) { + // Don't update golden reference if the amount of low_content for the + // current encoded frame is small, or if the recursive average of the + // low_content over the update interval window falls below threshold. + if (fraction_low < 0.65 || cr->low_content_avg < 0.6) { + cpi->refresh_golden_frame = 0; + } + // Reset for next internal. + cr->low_content_avg = fraction_low; + } + } } // Set golden frame update interval, for non-svc 1 pass CBR mode. @@ -282,72 +328,31 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { else rc->baseline_gf_interval = 40; if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20; + if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 && + cr->content_mode) + rc->baseline_gf_interval = 10; } -// Update some encoding stats (from the just encoded frame). If this frame's -// background has high motion, refresh the golden frame. Otherwise, if the -// golden reference is to be updated check if we should NOT update the golden -// ref. -void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) { - VP9_COMMON *const cm = &cpi->common; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - int mi_row, mi_col; - double fraction_low = 0.0; - int low_content_frame = 0; - MODE_INFO **mi = cm->mi_grid_visible; - RATE_CONTROL *const rc = &cpi->rc; - const int rows = cm->mi_rows, cols = cm->mi_cols; - int cnt1 = 0, cnt2 = 0; - int force_gf_refresh = 0; - int flag_force_gf_high_motion = 0; - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - if (flag_force_gf_high_motion == 1) { - int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 - ? mi[0]->mv[0].as_mv.row - : -1 * mi[0]->mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 - ? mi[0]->mv[0].as_mv.col - : -1 * mi[0]->mv[0].as_mv.col; - // Calculate the motion of the background. - if (abs_mvr <= 16 && abs_mvc <= 16) { - cnt1++; - if (abs_mvr == 0 && abs_mvc == 0) cnt2++; - } - } - mi++; - // Accumulate low_content_frame. - if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++; - } - mi += 8; - } - // For video conference clips, if the background has high motion in current - // frame because of the camera movement, set this frame as the golden frame. - // Use 70% and 5% as the thresholds for golden frame refreshing. - // Also, force this frame as a golden update frame if this frame will change - // the resolution (resize_pending != 0). - if (cpi->resize_pending != 0 || - (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) { - vp9_cyclic_refresh_set_golden_update(cpi); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - - if (rc->frames_till_gf_update_due > rc->frames_to_key) - rc->frames_till_gf_update_due = rc->frames_to_key; - cpi->refresh_golden_frame = 1; - force_gf_refresh = 1; - } - fraction_low = (double)low_content_frame / (rows * cols); - // Update average. - cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; - if (!force_gf_refresh && cpi->refresh_golden_frame == 1) { - // Don't update golden reference if the amount of low_content for the - // current encoded frame is small, or if the recursive average of the - // low_content over the update interval window falls below threshold. - if (fraction_low < 0.8 || cr->low_content_avg < 0.7) - cpi->refresh_golden_frame = 0; - // Reset for next internal. - cr->low_content_avg = fraction_low; +static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index, + int sb_col_index) { + unsigned int source_variance; + const uint8_t *src_y = cpi->Source->y_buffer; + const int ystride = cpi->Source->y_stride; + unsigned int sse; + const BLOCK_SIZE bsize = BLOCK_64X64; + src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + source_variance = + cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse); + if (source_variance == 0) { + uint64_t block_sad; + const uint8_t *last_src_y = cpi->Last_Source->y_buffer; + const int last_ystride = cpi->Last_Source->y_stride; + last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + block_sad = + cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride); + if (block_sad == 0) return 1; } + return 0; } // Update the segmentation map, and related quantities: cyclic refresh map, @@ -386,7 +391,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex) : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + cr->content_mode) { consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), @@ -400,8 +406,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * MI_BLOCK_SIZE; int mi_col = sb_col_index * MI_BLOCK_SIZE; + int flat_static_blocks = 0; + int compute_content = 1; assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) compute_content = 0; +#endif + if (cr->content_mode == 0 || cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + compute_content = 0; bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all 8x8 blocks in superblock and update map. xmis = @@ -410,16 +425,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && (xmis <= 2 || ymis <= 2)) - consec_zero_mv_thresh_block = 10; + consec_zero_mv_thresh_block = 4; for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; // If the block is as a candidate for clean up then mark it // for possible boost/refresh (segment 1). The segment id may get - // reset to 0 later if block gets coded anything other than ZEROMV. + // reset to 0 later depending on the coding mode. if (cr->map[bl_index2] == 0) { count_tot++; - if (cr->last_coded_q_map[bl_index2] > qindex_thresh || + if (cr->content_mode == 0 || + cr->last_coded_q_map[bl_index2] > qindex_thresh || cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; @@ -432,11 +448,21 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // Enforce constant segment over superblock. // If segment is at least half of superblock, set to 1. if (sum_map >= xmis * ymis / 2) { - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) { - seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; - } - cr->target_num_seg_blocks += xmis * ymis; + // This superblock is a candidate for refresh: + // compute spatial variance and exclude blocks that are spatially flat + // and stationary. Note: this is currently only done for screne content + // mode. + if (compute_content && cr->skip_flat_static_blocks) + flat_static_blocks = + is_superblock_flat_static(cpi, sb_row_index, sb_col_index); + if (!flat_static_blocks) { + // Label this superblock as segment 1. + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } } i++; if (i == sbs_in_frame) { @@ -445,7 +471,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); cr->sb_index = i; cr->reduce_refresh = 0; - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters. @@ -453,9 +480,33 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num8x8bl = cm->MBs << 2; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int thresh_low_motion = 20; + int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20, + rc->best_quality << 1); + int qp_max_thresh = 117 * MAXQ >> 7; + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 || + is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (cpi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || + (!cpi->use_svc && cr->content_mode && + rc->avg_frame_low_motion < thresh_low_motion && + rc->frames_since_key > 40) || + (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && + rc->frames_since_key > 20) || + (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) { + cr->apply_cyclic_refresh = 0; + return; + } cr->percent_refresh = 10; if (cr->reduce_refresh) cr->percent_refresh = 5; - cr->max_qdelta_perc = 50; + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->motion_thresh = 32; cr->rate_boost_fac = 15; @@ -468,20 +519,38 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = 3.0; } else { cr->rate_ratio_qdelta = 2.0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { + if (cr->content_mode && cpi->noise_estimate.enabled && + cpi->noise_estimate.level >= kMedium) { // Reduce the delta-qp if the estimated source noise is above threshold. cr->rate_ratio_qdelta = 1.7; cr->rate_boost_fac = 13; } } - // Adjust some parameters for low resolutions at low bitrates. - if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 16; - cr->rate_boost_fac = 13; + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + // Only enable feature of skipping flat_static blocks for top layer + // under screen content mode. + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cr->skip_flat_static_blocks = 1; + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10; + // Increase the amount of refresh on scene change that is encoded at max Q, + // increase for a few cycles of the refresh period (~100 / percent_refresh). + if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30) + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; } - if (cpi->svc.spatial_layer_id > 0) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 12; + // Adjust some parameters for low resolutions. + if (cm->width * cm->height <= 352 * 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 64; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); + } } if (cpi->oxcf.rc_mode == VPX_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. @@ -490,11 +559,37 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->percent_refresh = 10; cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; - if (cpi->refresh_golden_frame == 1) { + if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) { cr->percent_refresh = 0; cr->rate_ratio_qdelta = 1.0; } } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. To be used for setting the base qp for the + // frame in vp9_rc_regulate_q. + target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num8x8bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num8x8bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + // For screen-content: don't include target for the weight segment, + // since for all flat areas the segment is reset, so its more accurate + // to just use the previous actual number of seg blocks for the weight. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + weight_segment = + (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / + num8x8bl; + cr->weight_segment = weight_segment; + if (cr->content_mode == 0) { + cr->actual_num_seg1_blocks = + cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + cr->actual_num_seg2_blocks = 0; + cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl; + } } // Setup cyclic background refresh: set delta q and segmentation map. @@ -503,28 +598,33 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; - // TODO(marpan): Look into whether we should reduce the amount/delta-qp - // instead of completely shutting off at low bitrates. For now keep it on. - // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); - const int apply_cyclic_refresh = 1; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - // Don't apply refresh on key frame or temporal enhancement layer frames. - if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) || - (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) { + // Reset if resoluton change has occurred. + if (cpi->resize_pending != 0 && cpi->svc.temporal_layer_id == 0) + vp9_cyclic_refresh_reset_resize(cpi); + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) || + scene_change_detected) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); vp9_disable_segmentation(&cm->seg); - if (cm->frame_type == KEY_FRAME) { + if ((cm->frame_type == KEY_FRAME || scene_change_detected) && + cpi->svc.temporal_layer_id == 0) { memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; + cr->reduce_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; } return; } else { int qindex_delta = 0; int qindex2; const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + cr->counter_encode_maxq_scene_change++; vpx_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). @@ -574,9 +674,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); - // Reset if resoluton change has occurred. - if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); - // Update the segmentation and refresh map. cyclic_refresh_update_map(cpi); } @@ -590,8 +687,19 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; memset(cr->map, 0, cm->mi_rows * cm->mi_cols); - memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; + cr->counter_encode_maxq_scene_change = 0; +} + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index a4be031295..c74cee4743 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ -#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -66,6 +66,11 @@ struct CYCLIC_REFRESH { double low_content_avg; int qindex_delta[3]; int reduce_refresh; + double weight_segment; + int apply_cyclic_refresh; + int counter_encode_maxq_scene_change; + int skip_flat_static_blocks; + int content_mode; }; struct VP9_COMP; @@ -100,19 +105,15 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, int mi_row, int mi_col, BLOCK_SIZE bsize); -// Update the segmentation map, and related quantities: cyclic refresh map, -// refresh sb_index, and target number of blocks to be refreshed. -void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); - -// Update the actual number of blocks that were applied the segment delta q. +// From the just encoded frame: update the actual number of blocks that were +// applied the segment delta q, and the amount of low motion in the frame. +// Also check conditions for forcing golden update, or preventing golden +// update if the period is up. void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi); // Set golden frame update interval, for non-svc 1 pass CBR mode. void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi); -// Check if we should not update golden reference, based on past refresh stats. -void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi); - // Set/update global/frame level refresh parameters. void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi); @@ -137,8 +138,10 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { return CR_SEGMENT_ID_BASE; } +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c index 477f62ba5a..2d57615259 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c @@ -19,6 +19,7 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_segmentation.h" #define ENERGY_MIN (-4) @@ -31,7 +32,7 @@ static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0 }; static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 }; -#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN] +#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN] DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; #if CONFIG_VP9_HIGHBITDEPTH @@ -108,7 +109,7 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b, #if CONFIG_VP9_HIGHBITDEPTH static void aq_highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, uint64_t *sum) { + uint64_t *sse, int64_t *sum) { int i, j; uint16_t *a = CONVERT_TO_SHORTPTR(a8); @@ -127,15 +128,6 @@ static void aq_highbd_variance64(const uint8_t *a8, int a_stride, } } -static void aq_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} #endif // CONFIG_VP9_HIGHBITDEPTH static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, @@ -153,11 +145,13 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, int avg; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + uint64_t sse64 = 0; + int64_t sum64 = 0; + aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride, CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, - &sse, &avg); - sse >>= 2 * (xd->bd - 8); - avg >>= (xd->bd - 8); + &sse64, &sum64); + sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8))); + avg = (int)(sum64 >> (xd->bd - 8)); } else { aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0, bw, bh, &sse, &avg); @@ -193,12 +187,61 @@ double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { } #define DEFAULT_E_MIDPOINT 10.0 -int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { +static int scale_block_energy(VP9_COMP *cpi, unsigned int block_var) { double energy; double energy_midpoint; - vpx_clear_system_state(); energy_midpoint = (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT; - energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint; + energy = log(block_var + 1.0) - energy_midpoint; return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } +#undef DEFAULT_E_MIDPOINT + +// Get the range of sub block energy values; +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e) { + VP9_COMMON *const cm = &cpi->common; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y; + + if (xmis < bw || ymis < bh) { + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); + *min_e = vp9_block_energy(cpi, mb, bsize); + *max_e = *min_e; + } else { + unsigned int var; + // Because scale_block_energy is non-decreasing, we can find the min/max + // block variance and scale afterwards. This avoids a costly scaling at + // every iteration. + unsigned int min_var = UINT_MAX; + unsigned int max_var = 0; + + for (y = 0; y < ymis; ++y) { + for (x = 0; x < xmis; ++x) { + vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x); + vpx_clear_system_state(); + var = block_variance(cpi, mb, BLOCK_8X8); + vpx_clear_system_state(); + min_var = VPXMIN(min_var, var); + max_var = VPXMAX(max_var, var); + } + } + *min_e = scale_block_energy(cpi, min_var); + *max_e = scale_block_energy(cpi, max_var); + } + + // Re-instate source pointers back to what they should have been on entry. + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); +} + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + unsigned int var; + vpx_clear_system_state(); + var = block_variance(cpi, x, bs); + vpx_clear_system_state(); + return scale_block_energy(cpi, var); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h index 211a69f392..a4f872879d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_ -#define VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -20,11 +20,15 @@ extern "C" { unsigned int vp9_vaq_segment_id(int energy); void vp9_vaq_frame_setup(VP9_COMP *cpi); +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e); int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c index 49aea69ebd..8df16691f5 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c @@ -9,6 +9,7 @@ */ #include +#include #include #include @@ -18,6 +19,9 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem_ops.h" #include "vpx_ports/system_state.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" @@ -39,8 +43,10 @@ static const struct vp9_token intra_mode_encodings[INTRA_MODES] = { { 0, 1 }, { 6, 3 }, { 28, 5 }, { 30, 5 }, { 58, 6 }, { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 } }; -static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] = - { { 0, 1 }, { 2, 2 }, { 3, 2 } }; +static const struct vp9_token + switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 }, + { 2, 2 }, + { 3, 2 } }; static const struct vp9_token partition_encodings[PARTITION_TYPES] = { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }; @@ -50,6 +56,7 @@ static const struct vp9_token inter_mode_encodings[INTER_MODES] = { static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode, const vpx_prob *probs) { + assert(!is_inter_mode(mode)); vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]); } @@ -86,7 +93,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const vpx_prob *const tx_probs = - get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); + get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs); vpx_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vpx_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -129,9 +136,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, const TOKENEXTRA *p; const vp9_extra_bit *const extra_bits = #if CONFIG_VP9_HIGHBITDEPTH - (bit_depth == VPX_BITS_12) - ? vp9_extra_bits_high12 - : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits; + (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12 + : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 + : vp9_extra_bits; #else vp9_extra_bits; (void)bit_depth; @@ -164,8 +171,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, vpx_write_bit(w, p->extra & 1); } else { // t >= TWO_TOKEN && t < EOB_TOKEN const struct vp9_token *const a = &vp9_coef_encodings[t]; - const int v = a->value; - const int n = a->len; + int v = a->value; + int n = a->len; const int e = p->extra; vpx_write(w, 1, context_tree[2]); vp9_write_tree(w, vp9_coef_con_tree, @@ -174,8 +181,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, if (t >= CATEGORY1_TOKEN) { const vp9_extra_bit *const b = &extra_bits[t]; const unsigned char *pb = b->prob; - int v = e >> 1; - int n = b->len; // number of bits in v, assumed nonzero + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero do { const int bb = (v >> --n) & 1; vpx_write(w, bb, *pb++); @@ -217,7 +224,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, } if (is_compound) { - vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME, + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1], vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { const int bit0 = mi->ref_frame[0] != LAST_FRAME; @@ -234,8 +242,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd, const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w, unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES] - [SWITCHABLE]) { + int interp_filter_selected[][SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; const struct segmentation *const seg = &cm->seg; @@ -373,8 +380,7 @@ static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES] - [SWITCHABLE]) { + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const MB_MODE_INFO_EXT *const mbmi_ext = cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); @@ -424,8 +430,7 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize, unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES] - [SWITCHABLE]) { + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; @@ -463,7 +468,8 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, max_mv_magnitude, interp_filter_selected); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, max_mv_magnitude, interp_filter_selected); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, @@ -473,7 +479,6 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, subsize, max_mv_magnitude, interp_filter_selected); break; - default: assert(0); } } @@ -484,23 +489,30 @@ static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, } static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd, - const TileInfo *const tile, vpx_writer *w, - TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, - unsigned int *const max_mv_magnitude, - int interp_filter_selected[MAX_REF_FRAMES] - [SWITCHABLE]) { + const TileInfo *const tile, vpx_writer *w, int tile_row, + int tile_col, unsigned int *const max_mv_magnitude, + int interp_filter_selected[][SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - int mi_row, mi_col; + int mi_row, mi_col, tile_sb_row; + TOKENEXTRA *tok = NULL; + TOKENEXTRA *tok_end = NULL; set_partition_probs(cm, xd); for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start; + tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count; + vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col, BLOCK_64X64, max_mv_magnitude, interp_filter_selected); + + assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop); } } @@ -544,7 +556,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { /* dry run to see if there is any update at all needed */ - int savings = 0; + int64_t savings = 0; int update[2] = { 0, 0 }; for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { @@ -553,7 +565,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; const vpx_prob oldp = old_coef_probs[i][j][k][l][t]; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -589,8 +601,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - const vpx_prob upd = DIFF_UPDATE_PROB; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( @@ -614,9 +625,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, return; } - case ONE_LOOP_REDUCED: { + default: { int updates = 0; int noupdates_before_first = 0; + assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED); for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { @@ -625,7 +637,7 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - int s; + int64_t s; int u = 0; if (t == PIVOT_NODE) { @@ -666,7 +678,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, } return; } - default: assert(0); } } @@ -890,6 +901,19 @@ static void write_tile_info(const VP9_COMMON *const cm, } int vp9_get_refresh_mask(VP9_COMP *cpi) { + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const int this_gf_index = gf_group->index; + const int update_ref_idx = gf_group->update_ref_idx[this_gf_index]; + + if (update_ref_idx != INVALID_IDX) { + return (1 << update_ref_idx); + } else { + return 0; + } + } if (vp9_preserve_existing_gf(cpi)) { // We have decided to preserve the previously existing golden frame as our // new ARF frame. However, in the short term we leave it in the GF slot and, @@ -905,25 +929,40 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { (cpi->refresh_golden_frame << cpi->alt_fb_idx); } else { int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->multi_layer_arf) { + for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) { + if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx && + arf_idx != cpi->gld_fb_idx) { + int idx; + for (idx = 0; idx < gf_group->stack_size; ++idx) + if (arf_idx == gf_group->arf_index_stack[idx]) break; + if (idx == gf_group->stack_size) break; + } + } } + cpi->twopass.gf_group.top_arf_idx = arf_idx; + + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id]; return (cpi->refresh_last_frame << cpi->lst_fb_idx) | (cpi->refresh_golden_frame << cpi->gld_fb_idx) | (cpi->refresh_alt_ref_frame << arf_idx); } } -static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; MACROBLOCKD *const xd = &data->xd; - vpx_start_encode(&data->bit_writer, data->dest); + const int tile_row = 0; + vpx_start_encode(&data->bit_writer, data->dest, data->dest_size); write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, - &data->bit_writer, &data->tok, data->tok_end, + &data->bit_writer, tile_row, data->tile_idx, &data->max_mv_magnitude, data->interp_filter_selected); - assert(data->tok == data->tok_end); - vpx_stop_encode(&data->bit_writer); - return 1; + return vpx_stop_encode(&data->bit_writer) == 0; } void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { @@ -937,36 +976,47 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { } } -static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { +static size_t encode_tiles_buffer_alloc_size(const VP9_COMP *cpi) { + const VP9_COMMON *cm = &cpi->common; + const int image_bps = + (8 + 2 * (8 >> (cm->subsampling_x + cm->subsampling_y))) * + (1 + (cm->bit_depth > 8)); + const int64_t size = + (int64_t)cpi->oxcf.width * cpi->oxcf.height * image_bps / 8; + return (size_t)size; +} + +static void encode_tiles_buffer_alloc(VP9_COMP *const cpi, + size_t buffer_alloc_size) { + VP9_COMMON *const cm = &cpi->common; int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, + vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); - if (!cpi->vp9_bitstream_worker_data) return 1; for (i = 1; i < cpi->num_workers; ++i) { - cpi->vp9_bitstream_worker_data[i].dest_size = - cpi->oxcf.width * cpi->oxcf.height; - cpi->vp9_bitstream_worker_data[i].dest = - vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); - if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, + vpx_malloc(buffer_alloc_size)); + cpi->vp9_bitstream_worker_data[i].dest_size = buffer_alloc_size; } - return 0; } -static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { +static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr, + size_t data_size) { const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; const int num_workers = cpi->num_workers; size_t total_size = 0; int tile_col = 0; + int error = 0; + const size_t buffer_alloc_size = encode_tiles_buffer_alloc_size(cpi); if (!cpi->vp9_bitstream_worker_data || - cpi->vp9_bitstream_worker_data[1].dest_size > - (cpi->oxcf.width * cpi->oxcf.height)) { + cpi->vp9_bitstream_worker_data[1].dest_size != buffer_alloc_size) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); - if (encode_tiles_buffer_alloc(cpi)) return 0; + encode_tiles_buffer_alloc(cpi, buffer_alloc_size); } while (tile_col < tile_cols) { @@ -978,8 +1028,6 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { // Populate the worker data. data->xd = cpi->td.mb.e_mbd; data->tile_idx = tile_col; - data->tok = cpi->tile_tok[0][tile_col]; - data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col]; data->max_mv_magnitude = cpi->max_mv_magnitude; memset(data->interp_filter_selected, 0, sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE); @@ -988,12 +1036,17 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { if (i == 0) { // If this worker happens to be for the last tile, then do not offset it // by 4 for the tile size. - data->dest = - data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4); + const size_t offset = total_size + (tile_col == tile_cols - 1 ? 0 : 4); + if (data_size < offset) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "encode_tiles_mt: output buffer full"); + } + data->dest = data_ptr + offset; + data->dest_size = data_size - offset; } worker->data1 = cpi; worker->data2 = data; - worker->hook = (VPxWorkerHook)encode_tile_worker; + worker->hook = encode_tile_worker; worker->had_error = 0; if (i < num_workers - 1) { @@ -1010,7 +1063,11 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { uint32_t tile_size; int k; - if (!winterface->sync(worker)) return 0; + if (!winterface->sync(worker)) { + error = 1; + continue; + } + tile_size = data->bit_writer.pos; // Aggregate per-thread bitstream stats. @@ -1022,24 +1079,35 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { // Prefix the size of the tile on all but the last. if (tile_col != tile_cols || j < i - 1) { + if (data_size - total_size < 4) { + error = 1; + continue; + } mem_put_be32(data_ptr + total_size, tile_size); total_size += 4; } if (j > 0) { + if (data_size - total_size < tile_size) { + error = 1; + continue; + } memcpy(data_ptr + total_size, data->dest, tile_size); } total_size += tile_size; } + if (error) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "encode_tiles_mt: output buffer full"); + } } return total_size; } -static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { +static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr, size_t data_size) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; vpx_writer residual_bc; int tile_row, tile_col; - TOKENEXTRA *tok_end; size_t total_size = 0; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -1052,27 +1120,32 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { // that it does not make the overall process worse in any case. if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 && tile_cols > 1) { - return encode_tiles_mt(cpi, data_ptr); + return encode_tiles_mt(cpi, data_ptr, data_size); } for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_idx = tile_row * tile_cols + tile_col; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; - - tok_end = cpi->tile_tok[tile_row][tile_col] + - cpi->tok_count[tile_row][tile_col]; + size_t offset; if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) - vpx_start_encode(&residual_bc, data_ptr + total_size + 4); + offset = total_size + 4; else - vpx_start_encode(&residual_bc, data_ptr + total_size); + offset = total_size; + if (data_size < offset) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "encode_tiles: output buffer full"); + } + vpx_start_encode(&residual_bc, data_ptr + offset, data_size - offset); write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc, - &tok, tok_end, &cpi->max_mv_magnitude, + tile_row, tile_col, &cpi->max_mv_magnitude, cpi->interp_filter_selected); - assert(tok == tok_end); - vpx_stop_encode(&residual_bc); + + if (vpx_stop_encode(&residual_bc)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "encode_tiles: output buffer full"); + } if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile mem_put_be32(data_ptr + total_size, residual_bc.pos); @@ -1118,11 +1191,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || (cpi->svc.number_spatial_layers > 1 && - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) || - (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state == ENCODING && - cpi->svc.layer_context[0].frames_from_key_frame < - cpi->svc.number_temporal_layers + 1))) { + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { found = 0; } else if (cfg != NULL) { found = @@ -1154,8 +1223,10 @@ static void write_profile(BITSTREAM_PROFILE profile, case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break; case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break; case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break; - default: assert(0); + default: + assert(profile == PROFILE_3); + vpx_wb_write_literal(wb, 6, 3); + break; } } @@ -1179,6 +1250,7 @@ static void write_bitdepth_colorspace_sampling( } } else { assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); + assert(cm->subsampling_x == 0 && cm->subsampling_y == 0); vpx_wb_write_bit(wb, 0); // unused } } @@ -1192,7 +1264,13 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_profile(cm->profile, wb); - vpx_wb_write_bit(wb, 0); // show_existing_frame + // If to use show existing frame. + vpx_wb_write_bit(wb, cm->show_existing_frame); + if (cm->show_existing_frame) { + vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3); + return; + } + vpx_wb_write_bit(wb, cm->frame_type); vpx_wb_write_bit(wb, cm->show_frame); vpx_wb_write_bit(wb, cm->error_resilient_mode); @@ -1202,14 +1280,6 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_bitdepth_colorspace_sampling(cm, wb); write_frame_size(cm, wb); } else { - // In spatial svc if it's not error_resilient_mode then we need to code all - // visible frames as invisible. But we need to keep the show_frame flag so - // that the publisher could know whether it is supposed to be visible. - // So we will code the show_frame flag as it is. Then code the intra_only - // bit here. This will make the bitstream incompatible. In the player we - // will change to show_frame flag to 0, then add an one byte frame with - // show_existing_frame flag which tells the decoder which frame we want to - // show. if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only); if (!cm->error_resilient_mode) @@ -1258,14 +1328,15 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_tile_info(cm, wb); } -static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { +static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data, + size_t data_size) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; FRAME_CONTEXT *const fc = cm->fc; FRAME_COUNTS *counts = cpi->td.counts; vpx_writer header_bc; - vpx_start_encode(&header_bc, data); + vpx_start_encode(&header_bc, data, data_size); if (xd->lossless) cm->tx_mode = ONLY_4X4; @@ -1329,33 +1400,67 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { &counts->mv); } - vpx_stop_encode(&header_bc); - assert(header_bc.pos <= 0xffff); + if (vpx_stop_encode(&header_bc)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "write_compressed_header: output buffer full"); + } return header_bc.pos; } -void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t dest_size, + size_t *size) { + VP9_COMMON *const cm = &cpi->common; uint8_t *data = dest; - size_t first_part_size, uncompressed_hdr_size; - struct vpx_write_bit_buffer wb = { data, 0 }; + size_t data_size = dest_size; + size_t uncompressed_hdr_size, compressed_hdr_size; + struct vpx_write_bit_buffer wb; struct vpx_write_bit_buffer saved_wb; +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + + vpx_wb_init(&wb, data, data_size); write_uncompressed_header(cpi, &wb); + if (vpx_wb_has_error(&wb)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "vp9_pack_bitstream: output buffer full"); + } + + // Skip the rest coding process if use show existing frame. + if (cm->show_existing_frame) { + uncompressed_hdr_size = vpx_wb_bytes_written(&wb); + data += uncompressed_hdr_size; + *size = data - dest; + return; + } + saved_wb = wb; - vpx_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size + // don't know in advance compressed header size + vpx_wb_write_literal(&wb, 0, 16); + if (vpx_wb_has_error(&wb)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "vp9_pack_bitstream: output buffer full"); + } uncompressed_hdr_size = vpx_wb_bytes_written(&wb); data += uncompressed_hdr_size; + data_size -= uncompressed_hdr_size; vpx_clear_system_state(); - first_part_size = write_compressed_header(cpi, data); - data += first_part_size; - // TODO(jbb): Figure out what to do if first_part_size > 16 bits. - vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16); + compressed_hdr_size = write_compressed_header(cpi, data, data_size); + data += compressed_hdr_size; + data_size -= compressed_hdr_size; + if (compressed_hdr_size > UINT16_MAX) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "compressed_hdr_size > 16 bits"); + } + vpx_wb_write_literal(&saved_wb, (int)compressed_hdr_size, 16); + assert(!vpx_wb_has_error(&saved_wb)); - data += encode_tiles(cpi, data); + data += encode_tiles(cpi, data, data_size); *size = data - dest; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h index 044a3bbc7b..1120841ecb 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_BITSTREAM_H_ -#define VP9_ENCODER_VP9_BITSTREAM_H_ +#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_ +#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -19,9 +19,7 @@ extern "C" { typedef struct VP9BitstreamWorkerData { uint8_t *dest; - int dest_size; - TOKENEXTRA *tok; - TOKENEXTRA *tok_end; + size_t dest_size; vpx_writer bit_writer; int tile_idx; unsigned int max_mv_magnitude; @@ -37,19 +35,16 @@ int vp9_get_refresh_mask(VP9_COMP *cpi); void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); -void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t dest_size, + size_t *size); static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { - return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && - (!cpi->use_svc || // Add spatial svc base layer case here - (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].gold_ref_idx >= 0 && - cpi->oxcf.ss_enable_auto_arf[0])); + return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref && + !cpi->use_svc; } #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_BITSTREAM_H_ +#endif // VPX_VP9_ENCODER_VP9_BITSTREAM_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h index 1ea5fdf1ff..abb462c46a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_block.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_BLOCK_H_ -#define VP9_ENCODER_VP9_BLOCK_H_ +#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_ +#define VPX_VP9_ENCODER_VP9_BLOCK_H_ +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -18,12 +19,6 @@ extern "C" { #endif -typedef struct { - unsigned int sse; - int sum; - unsigned int var; -} diff; - struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); tran_low_t *qcoeff; @@ -31,9 +26,9 @@ struct macroblock_plane { uint16_t *eobs; struct buf_2d src; - // Quantizer setings - int16_t *quant_fp; + // Quantizer settings int16_t *round_fp; + int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; int16_t *zbin; @@ -61,6 +56,11 @@ typedef struct { typedef struct macroblock MACROBLOCK; struct macroblock { +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if defined(_MSC_VER) && _MSC_VER < 1900 + int64_t bsse[MAX_MB_PLANE << 2]; +#endif + struct macroblock_plane plane[MAX_MB_PLANE]; MACROBLOCKD e_mbd; @@ -71,23 +71,23 @@ struct macroblock { int skip_recode; int skip_optimize; int q_index; - int block_qcoeff_opt; + double log_block_src_var; int block_tx_domain; // The equivalent error at the current rdmult of one whole bit (not one // bitcost unit). int errorperbit; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for large blocks. int sadperbit16; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; int rddiv; int rdmult; + int cb_rdmult; + int segment_id; int mb_energy; - int *m_search_count_ptr; - int *ex_search_count_ptr; // These are set to their default values at the beginning, and then adjusted // further in the encoding process. @@ -110,14 +110,23 @@ struct macroblock { int *nmvsadcost_hp[2]; int **mvsadcost; + // sharpness is used to disable skip mode and change rd_mult + int sharpness; + + // aq mode is used to adjust rd based on segment. + int adjust_rdmult_by_segment; + // These define limits to motion vector components to prevent them // from extending outside the UMV borders MvLimits mv_limits; - // Notes transform blocks where no coefficents are coded. + // Notes transform blocks where no coefficients are coded. // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; + // Accumulate the tx block eobs in a partition block. + int32_t sum_y_eobs[TX_SIZES]; + int skip; int encode_breakout; @@ -131,16 +140,26 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // In first pass, intra prediction is done based on source pixels + // at tile boundaries + int fp_src_pred; + // use fast quantization process int quant_fp; // skip forward transform and quantization uint8_t skip_txfm[MAX_MB_PLANE << 2]; #define SKIP_TXFM_NONE 0 +// TODO(chengchen): consider remove SKIP_TXFM_AC_DC from vp9 completely +// since it increases risks of bad perceptual quality. +// https://crbug.com/webm/1729 #define SKIP_TXFM_AC_DC 1 #define SKIP_TXFM_AC_ONLY 2 +// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054 +#if !defined(_MSC_VER) || _MSC_VER >= 1900 int64_t bsse[MAX_MB_PLANE << 2]; +#endif // Used to store sub partition's choices. MV pred_mv[MAX_REF_FRAMES]; @@ -151,21 +170,48 @@ struct macroblock { uint8_t sb_is_skin; + uint8_t skip_low_source_sad; + + uint8_t lowvar_highsumdiff; + + uint8_t last_sb_high_content; + + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + + int zero_temp_sad_source; + + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) + // based on source sad, prior to encoding the frame. + uint8_t content_state_sb; + // Used to save the status of whether a block has a low variance in // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for // 32x32, 9~24 for 16x16. uint8_t variance_low[25]; - void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); - void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, - int eob, int bd); + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); #endif + DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]); + + struct scale_factors *me_sf; }; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_BLOCK_H_ +#endif // VPX_VP9_ENCODER_VP9_BLOCK_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c index 9ab57b57c7..da68a3c3c3 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c @@ -11,6 +11,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/system_state.h" +#include "vp9/encoder/vp9_blockiness.h" static int horizontal_filter(const uint8_t *s) { return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h new file mode 100644 index 0000000000..e840cb2518 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ +#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, + const uint8_t *img2, int img2_pitch, int width, + int height); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c index 2f7e544332..ee0fcd8729 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c @@ -12,7 +12,10 @@ #include "vp9/encoder/vp9_encoder.h" static const BLOCK_SIZE square[] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, }; static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, @@ -22,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int i, k; ctx->num_4x4_blk = num_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t))); + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; @@ -97,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { int nodes; vpx_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); vpx_free(td->pc_tree); - CHECK_MEM_ERROR(cm, td->pc_tree, + CHECK_MEM_ERROR(&cm->error, td->pc_tree, vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; @@ -115,8 +119,8 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; tree->block_size = square[0]; alloc_tree_contexts(cm, tree, 4); - tree->leaf_split[0] = this_leaf++; - for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0]; + tree->u.leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) tree->u.leaf_split[j] = tree->u.leaf_split[0]; } // Each node has 4 leaf nodes, fill each block_size level of the tree @@ -126,7 +130,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { PC_TREE *const tree = &td->pc_tree[pc_tree_index]; alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); tree->block_size = square[square_index]; - for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + for (j = 0; j < 4; j++) tree->u.split[j] = this_pc++; ++pc_tree_index; } ++square_index; @@ -136,17 +140,22 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { } void vp9_free_pc_tree(ThreadData *td) { - const int tree_nodes = 64 + 16 + 4 + 1; int i; - // Set up all 4x4 mode contexts - for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]); + if (td == NULL) return; - // Sets up all the leaf nodes in the tree. - for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + if (td->leaf_tree != NULL) { + // Set up all 4x4 mode contexts + for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]); + vpx_free(td->leaf_tree); + td->leaf_tree = NULL; + } - vpx_free(td->pc_tree); - td->pc_tree = NULL; - vpx_free(td->leaf_tree); - td->leaf_tree = NULL; + if (td->pc_tree != NULL) { + const int tree_nodes = 64 + 16 + 4 + 1; + // Sets up all the leaf nodes in the tree. + for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + vpx_free(td->pc_tree); + td->pc_tree = NULL; + } } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h index 86ba03d69f..51e13ba654 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_ -#define VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_block.h" @@ -56,6 +56,7 @@ typedef struct { // scope of refactoring. int rate; int64_t dist; + int64_t rdcost; #if CONFIG_VP9_TEMPORAL_DENOISING unsigned int newmv_sse; @@ -65,12 +66,18 @@ typedef struct { int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; #endif // motion vector cache for adaptive motion search control in partition // search loop MV pred_mv[MAX_REF_FRAMES]; INTERP_FILTER pred_interp_filter; + + // Used for the machine learning-based early termination + int32_t sum_y_eobs; + // Skip certain ref frames during RD search of rectangular partitions. + uint8_t skip_ref_frame_mask; } PICK_MODE_CONTEXT; typedef struct PC_TREE { @@ -83,7 +90,10 @@ typedef struct PC_TREE { union { struct PC_TREE *split[4]; PICK_MODE_CONTEXT *leaf_split[4]; - }; + } u; + // Obtained from a simple motion search. Used by the ML based partition search + // speed feature. + MV mv; } PC_TREE; void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td); @@ -93,4 +103,4 @@ void vp9_free_pc_tree(struct ThreadData *td); } // extern "C" #endif -#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */ +#endif // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h index 70a1a2e0e9..ee0033fa31 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_COST_H_ -#define VP9_ENCODER_VP9_COST_H_ +#ifndef VPX_VP9_ENCODER_VP9_COST_H_ +#define VPX_VP9_ENCODER_VP9_COST_H_ #include "vpx_dsp/prob.h" #include "vpx/vpx_integer.h" @@ -29,9 +29,8 @@ extern const uint16_t vp9_prob_cost[256]; #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob)) -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - vpx_prob p) { - return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); +static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) { + return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p); } static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits, @@ -55,4 +54,4 @@ void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_COST_H_ +#endif // VPX_VP9_ENCODER_VP9_COST_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c index bb8c23fdb9..2f42c6afc2 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c @@ -554,114 +554,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, } } -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, - tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - int eob = -1; - - int i, j; - tran_low_t intermediate[64]; - - // Transform columns - { - tran_low_t *output = intermediate; - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - int i; - for (i = 0; i < 8; i++) { - // stage 1 - s0 = (input[0 * stride] + input[7 * stride]) * 4; - s1 = (input[1 * stride] + input[6 * stride]) * 4; - s2 = (input[2 * stride] + input[5 * stride]) * 4; - s3 = (input[3 * stride] + input[4 * stride]) * 4; - s4 = (input[3 * stride] - input[4 * stride]) * 4; - s5 = (input[2 * stride] - input[5 * stride]) * 4; - s6 = (input[1 * stride] - input[6 * stride]) * 4; - s7 = (input[0 * stride] - input[7 * stride]) * 4; - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x2 * cospi_24_64 + x3 * cospi_8_64; - t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0 * 8] = (tran_low_t)fdct_round_shift(t0); - output[2 * 8] = (tran_low_t)fdct_round_shift(t2); - output[4 * 8] = (tran_low_t)fdct_round_shift(t1); - output[6 * 8] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1 * 8] = (tran_low_t)fdct_round_shift(t0); - output[3 * 8] = (tran_low_t)fdct_round_shift(t2); - output[5 * 8] = (tran_low_t)fdct_round_shift(t1); - output[7 * 8] = (tran_low_t)fdct_round_shift(t3); - input++; - output++; - } - } - - // Rows - for (i = 0; i < 8; ++i) { - fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]); - for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2; - } - - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; - - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - - if (tmp) eob = i; - } - } - *eob_ptr = eob + 1; -} - void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type) { if (tx_type == DCT_DCT) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c index 1d9a6702df..e5dffa90a8 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c @@ -185,33 +185,41 @@ static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, } static VP9_DENOISER_DECISION perform_motion_compensation( - VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, + VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, - int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv) { - int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; - MV_REFERENCE_FRAME frame; + int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer, int use_gf_temporal_ref) { + const int sse_diff = (ctx->newmv_sse == UINT_MAX) + ? 0 + : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); + int frame; + int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO saved_mi; int i; struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; + const RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; + saved_mi = *mi; if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; - // Avoid denoising for small block (unless motion is small). - // Small blocks are selected in variance partition (before encoding) and - // will typically lie on moving areas. - if (denoiser->denoising_level < kDenHigh && motion_magnitude > 16 && - bs <= BLOCK_8X8) + // Avoid denoising small blocks. When noise > kDenLow or frame width > 480, + // denoise 16x16 blocks. + if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 || + (bs == BLOCK_16X16 && width > 480 && + denoiser->denoising_level <= kDenLow)) return COPY_BLOCK; // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && ctx->newmv_sse != UINT_MAX && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -221,9 +229,12 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (frame != LAST_FRAME && - ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || - denoiser->denoising_level >= kDenHigh)) { + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || + (frame == GOLDEN_FRAME && use_gf_temporal_ref) || + (frame != LAST_FRAME && + ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) || + denoiser->denoising_level >= kDenHigh))) { frame = LAST_FRAME; ctx->newmv_sse = ctx->zeromv_lastref_sse; } @@ -238,6 +249,27 @@ static VP9_DENOISER_DECISION perform_motion_compensation( } } + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. + if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. + if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -254,6 +286,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( saved_pre[i] = filter_mbd->plane[i].pre[0]; saved_dst[i] = filter_mbd->plane[i].dst; } + saved_block_refs[0] = filter_mbd->block_refs[0]; // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser // struct. @@ -270,23 +303,28 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; - filter_mbd->plane[0].dst.buf = - block_start(denoiser->mc_running_avg_y.y_buffer, - denoiser->mc_running_avg_y.y_stride, mi_row, mi_col); - filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; - filter_mbd->plane[1].dst.buf = - block_start(denoiser->mc_running_avg_y.u_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; - filter_mbd->plane[2].dst.buf = - block_start(denoiser->mc_running_avg_y.v_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state *mi = saved_mi; + filter_mbd->block_refs[0] = saved_block_refs[0]; for (i = 0; i < MAX_MB_PLANE; ++i) { filter_mbd->plane[i].pre[0] = saved_pre[i]; filter_mbd->plane[i].dst = saved_dst[i]; @@ -297,20 +335,31 @@ static VP9_DENOISER_DECISION perform_motion_compensation( void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision) { + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { int mv_col, mv_row; int motion_magnitude = 0; int zeromv_filter = 0; VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER_DECISION decision = COPY_BLOCK; - YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; - YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; int is_skin = 0; + int increase_denoising = 0; int consec_zeromv = 0; + int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; motion_magnitude = mv_row * mv_row + mv_col * mv_col; @@ -326,8 +375,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; int j, i; // Loop through the 8x8 sub-blocks. - const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; @@ -338,10 +387,10 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); // No need to keep checking 8x8 blocks if any of the sub-blocks // has small consec_zeromv (since threshold for no_skin based on - // zero/small motion in skin detection is high, i.e, > 4). + // zero/small motion in skin detection is high, i.e., > 4). if (consec_zeromv < 4) { i = ymis; - j = xmis; + break; } } } @@ -352,30 +401,34 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv, motion_level); } - if (!is_skin && denoiser->denoising_level == kDenHigh) { - denoiser->increase_denoising = 1; - } else { - denoiser->increase_denoising = 0; - } + if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - if (denoiser->denoising_level >= kDenLow) + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. + if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) decision = perform_motion_compensation( - denoiser, mb, bs, denoiser->increase_denoising, mi_row, mi_col, ctx, - motion_magnitude, is_skin, &zeromv_filter, consec_zeromv); + &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, + motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id, + use_gf_temporal_ref); if (decision == FILTER_BLOCK) { - decision = vp9_denoiser_filter( - src.buf, src.stride, mc_avg_start, mc_avg.y_stride, avg_start, - avg.y_stride, denoiser->increase_denoising, bs, motion_magnitude); + decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, + mc_avg.y_stride, avg_start, avg.y_stride, + increase_denoising, bs, motion_magnitude); } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; @@ -408,49 +461,64 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, src->y_buffer = tmp_buf; } -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized) { +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or - // if the just encoded frame was resized. - if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset) { + // if the just encoded frame was resized. For SVC, copy source if the base + // spatial layer was key frame. + if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || + svc_refresh_denoiser_buffers) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) - copy_frame(&denoiser->running_avg_y[i], &src); + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } denoiser->reset = 0; return; } - // If more than one refresh occurs, must copy frame buffer. - if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { - if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { - if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); - } - if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + // If more than one refresh occurs, must copy frame buffer. + if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } } } } @@ -479,19 +547,122 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, } } -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif int border) { - int i, fail; + int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + &cm->error, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); - for (i = 0; i < MAX_REF_FRAMES; ++i) { - fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif @@ -500,22 +671,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, vp9_denoiser_free(denoiser); return 1; } -#ifdef OUTPUT_YUV_DENOISED - make_grayscale(&denoiser->running_avg_y[i]); -#endif - } - - fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx, - ssy, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif - border, legacy_byte_alignment); - if (fail) { - vp9_denoiser_free(denoiser); - return 1; } + // denoiser->last_source only used for noise_estimation, so only for top + // layer. fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, @@ -528,11 +687,11 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, #ifdef OUTPUT_YUV_DENOISED make_grayscale(&denoiser->running_avg_y[i]); #endif - denoiser->increase_denoising = 0; denoiser->frame_buffer_initialized = 1; - denoiser->denoising_level = kDenLow; - denoiser->prev_denoising_level = kDenLow; + denoiser->denoising_level = kDenMedium; + denoiser->prev_denoising_level = kDenMedium; denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; return 0; } @@ -542,23 +701,126 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } denoiser->frame_buffer_initialized = 0; - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { vpx_free_frame_buffer(&denoiser->running_avg_y[i]); } - vpx_free_frame_buffer(&denoiser->mc_running_avg_y); + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; vpx_free_frame_buffer(&denoiser->last_source); } -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { +static void force_refresh_longterm_ref(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} + +void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) { + VP9_DENOISER *const denoiser = &cpi->denoiser; denoiser->denoising_level = noise_level; if (denoiser->denoising_level > kDenLowLow && - denoiser->prev_denoising_level == kDenLowLow) + denoiser->prev_denoising_level == kDenLowLow) { denoiser->reset = 1; - else + force_refresh_longterm_ref(cpi); + } else { denoiser->reset = 0; + } denoiser->prev_denoising_level = denoiser->denoising_level; } +// Scale/increase the partition threshold +// for denoiser speed-up. +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) || + (temporal_layer_id != 0)) { + int64_t scaled_thr = + (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2; + return scaled_thr; + } else { + return (5 * threshold) >> 2; + } +} + +// Scale/increase the ac skip threshold for +// denoiser speed-up. +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id) { + if (noise_level >= kDenLow && abs_sumdiff < 5) + return threshold *= (noise_level == kDenLow) ? 2 + : (temporal_layer_id == 2) ? 10 + : 6; + else + return threshold; +} + +void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) { + if (vp9_denoise_svc_non_key(cpi) && + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; + force_refresh_longterm_ref(cpi); + } +} + +void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; + cpi->denoiser.current_denoiser_frame++; + if (cpi->use_svc) { + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } + vp9_denoiser_update_frame_info( + &cpi->denoiser, *cpi->Source, svc, frame_type, + cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers, + denoise_svc_second_layer); + } +} + #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h index fcfaa5051a..1973e98988 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_DENOISER_H_ -#define VP9_ENCODER_DENOISER_H_ +#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_ +#define VPX_VP9_ENCODER_VP9_DENOISER_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -21,6 +21,14 @@ extern "C" { #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. [0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, @@ -35,12 +43,14 @@ typedef enum vp9_denoiser_level { } VP9_DENOISER_LEVEL; typedef struct vp9_denoiser { - YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; - YV12_BUFFER_CONFIG mc_running_avg_y; + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; - int increase_denoising; int frame_buffer_initialized; int reset; + int num_ref_frames; + int num_layers; + unsigned int current_denoiser_frame; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -58,17 +68,18 @@ typedef struct { } VP9_PICKMODE_CTX_DEN; struct VP9_COMP; +struct SVC; -void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, - YV12_BUFFER_CONFIG src, - FRAME_TYPE frame_type, - int refresh_alt_ref_frame, - int refresh_golden_frame, - int refresh_last_frame, int resized); +void vp9_denoiser_update_frame_info( + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision); + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); @@ -76,8 +87,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif @@ -95,10 +112,21 @@ static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, void vp9_denoiser_free(VP9_DENOISER *denoiser); -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); +void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level); + +void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi); + +int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, + int content_state, int temporal_layer_id); + +int64_t vp9_scale_acskip_thresh(int64_t threshold, + VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, + int temporal_layer_id); + +void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_DENOISER_H_ +#endif // VPX_VP9_ENCODER_VP9_DENOISER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c index 323c053edf..53b8136b3e 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include @@ -20,6 +21,10 @@ #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_pthread.h" +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" @@ -32,16 +37,22 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" - +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_360.h" #include "vp9/encoder/vp9_aq_complexity.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_partition_models.h" #include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" @@ -98,19 +109,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { }; #endif // CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs) { +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); + return var; } #if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs, int bd) { +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { unsigned int var, sse; switch (bd) { case 10: @@ -130,37 +139,90 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return ROUND64_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]); + return var; } #endif // CONFIG_VP9_HIGHBITDEPTH -static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - int mi_row, int mi_col, - BLOCK_SIZE bs) { - unsigned int sse, var; - uint8_t *last_y; - const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); - - assert(last != NULL); - last_y = - &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; - var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); } -static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x, - int mi_row, int mi_col) { - unsigned int var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); - if (var < 8) - return BLOCK_64X64; - else if (var < 128) - return BLOCK_32X32; - else if (var < 2048) - return BLOCK_16X16; - else - return BLOCK_8X8; +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd), + num_pels_log2_lookup[bs]); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int segment_index) { + VP9_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + + // Initialize the segmentation index as 0. + mi->segment_id = 0; + + // Skip the rest if AQ mode is disabled. + if (!seg->enabled) return; + + switch (aq_mode) { + case CYCLIC_REFRESH_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#if !CONFIG_REALTIME_ONLY + case VARIANCE_AQ: + if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int min_energy; + int max_energy; + // Get sub block energy range + if (bsize >= BLOCK_32X32) { + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + } else { + min_energy = bsize <= BLOCK_16X16 ? x->mb_energy + : vp9_block_energy(cpi, x, bsize); + } + mi->segment_id = vp9_vaq_segment_id(min_energy); + } else { + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + break; + case EQUATOR360_AQ: + if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) + mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); + else + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; +#endif + case LOOKAHEAD_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case PSNR_AQ: mi->segment_id = segment_index; break; + case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break; + default: + // NO_AQ or PSNR_AQ + break; + } + + // Set segment index if ROI map or active_map is enabled. + if (cpi->roi.enabled || cpi->active_map.enabled) + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + + vp9_init_plane_quantizers(cpi, x); } // Lighter version of set_offsets that only sets the mode info @@ -175,23 +237,57 @@ static INLINE void set_mode_info_offsets(VP9_COMMON *const cm, x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); } +static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const VP9_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_16X16; + const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base]; + const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + const int num_bcols = + (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w; + const int num_brows = + (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + assert(cpi->oxcf.tuning == VP8_TUNE_SSIM); + + for (row = mi_row / num_8x8_w; + row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) { + for (col = mi_col / num_8x8_h; + col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale); + *rdmult = VPXMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + vpx_clear_system_state(); +} + static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const struct segmentation *const seg = &cm->seg; MvLimits *const mv_limits = &x->mv_limits; set_skip_context(xd, mi_row, mi_col); set_mode_info_offsets(cm, x, xd, mi_row, mi_col); - mi = xd->mi[0]; - // Set up destination pointers. vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); @@ -213,21 +309,8 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, // R/D setup. x->rddiv = cpi->rd.RDDIV; x->rdmult = cpi->rd.RDMULT; - - // Setup segment ID. - if (seg->enabled) { - if (cpi->oxcf.aq_mode != VARIANCE_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ && - cpi->oxcf.aq_mode != EQUATOR360_AQ) { - const uint8_t *const map = - seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - vp9_init_plane_quantizers(cpi, x); - - x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id]; - } else { - mi->segment_id = 0; - x->encode_breakout = cpi->encode_breakout; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); } // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs() @@ -259,21 +342,25 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, } typedef struct { - int64_t sum_square_error; - int64_t sum_error; + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; int log2_count; int variance; -} var; +} Var; typedef struct { - var none; - var horz[2]; - var vert[2]; + Var none; + Var horz[2]; + Var vert[2]; } partition_variance; typedef struct { partition_variance part_variances; - var split[4]; + Var split[4]; } v4x4; typedef struct { @@ -298,7 +385,7 @@ typedef struct { typedef struct { partition_variance *part_variances; - var *split[4]; + Var *split[4]; } variance_node; typedef enum { @@ -339,34 +426,32 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->split[i] = &vt->split[i].part_variances.none; break; } - case BLOCK_4X4: { + default: { v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); node->part_variances = &vt->part_variances; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; break; } - default: { - assert(0); - break; - } } } // Set variance values given sum square error, sum error, count. -static void fill_variance(int64_t s2, int64_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static void get_variance(var *v) { +static void get_variance(Var *v) { v->variance = (int)(256 * (v->sum_square_error - - ((v->sum_error * v->sum_error) >> v->log2_count)) >> + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> v->log2_count); } -static void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const Var *a, const Var *b, Var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -404,7 +489,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_min) { // Variance already computed to set the force_split. - if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && vt.part_variances->none.variance < threshold) { @@ -414,9 +499,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } else if (bsize > bsize_min) { // Variance already computed to set the force_split. - if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); // For key frame: take split for bsize above 32X32 or very high variance. - if (cm->frame_type == KEY_FRAME && + if (frame_is_intra_only(cm) && (bsize > BLOCK_32X32 || vt.part_variances->none.variance > (threshold << 4))) { return 0; @@ -461,16 +546,39 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + if (speed >= 8) { + if (width <= 640 && height <= 480) + return (5 * threshold_base) >> 2; + else if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) + return (5 * threshold_base) >> 2; + } else if (speed == 7) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { + return (5 * threshold_base) >> 2; + } + } + return threshold_base; +} + // Set the variance split thresholds for following the block sizes: // 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16, // 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to 4x4 partition) is // currently only used on key frame. -static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { +static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); - const int threshold_multiplier = is_key_frame ? 20 : 1; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = + is_key_frame ? 20 : cpi->sf.variance_part_thresh_mult; int64_t threshold_base = (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); + if (is_key_frame) { thresholds[0] = threshold_base; thresholds[1] = threshold_base >> 2; @@ -488,12 +596,33 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { else if (noise_level < kLow) threshold_base = (7 * threshold_base) >> 3; } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow) + threshold_base = + vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level, + content_state, cpi->svc.temporal_layer_id); + else + threshold_base = + scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width, + cm->height, content_state); +#else + // Increase base variance threshold based on content_state/sum_diff level. + threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); +#endif thresholds[0] = threshold_base; thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; thresholds[1] = threshold_base >> 1; thresholds[2] = threshold_base << 3; + if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 220) + thresholds[2] = thresholds[2] << 2; + else if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 200) + thresholds[2] = thresholds[2] << 1; } else if (cm->width < 1280 && cm->height < 720) { thresholds[1] = (5 * threshold_base) >> 2; } else if (cm->width < 1920 && cm->height < 1080) { @@ -501,21 +630,24 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { } else { thresholds[1] = (5 * threshold_base) >> 1; } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; } } -void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { +void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, + int content_state) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int is_key_frame = frame_is_intra_only(cm); if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { return; } else { - set_vbp_thresholds(cpi, cpi->vbp_thresholds, q); + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state); // The thresholds below are not changed locally. if (is_key_frame) { cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; cpi->vbp_bsize_min = BLOCK_8X8; } else { if (cm->width <= 352 && cm->height <= 288) @@ -525,8 +657,20 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { ? (cpi->y_dequant[q][1] << 1) : 1000; cpi->vbp_bsize_min = BLOCK_16X16; + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_copy = 4000; + else if (cm->width <= 640 && cm->height <= 360) + cpi->vbp_threshold_copy = 8000; + else + cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 + ? (cpi->y_dequant[q][1] << 3) + : 8000; + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + } } - cpi->vbp_threshold_copy = cpi->vbp_thresholds[0] << 16; cpi->vbp_threshold_minmax = 15 + (q >> 3); } } @@ -639,12 +783,14 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, } } -#if !CONFIG_VP9_HIGHBITDEPTH // Check if most of the superblock is skin content, and if so, force split to // 32x32, and set x->sb_is_skin for use in mode selection. -static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, - int mi_row, int mi_col, int *force_split) { +static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row, + int mi_col, int *force_split) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return 0; +#endif // Avoid checking superblocks on/near boundary and avoid low resolutions. // Note superblock may still pick 64X64 if y_sad is very small // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is. @@ -652,11 +798,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, mi_row + 8 < cm->mi_rows)) { int num_16x16_skin = 0; int num_16x16_nonskin = 0; - uint8_t *ysignal = x->plane[0].src.buf; - uint8_t *usignal = x->plane[1].src.buf; - uint8_t *vsignal = x->plane[2].src.buf; - int sp = x->plane[0].src.stride; - int spuv = x->plane[1].src.stride; const int block_index = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; @@ -667,16 +808,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, for (i = 0; i < ymis; i += 2) { for (j = 0; j < xmis; j += 2) { int bl_index = block_index + i * cm->mi_cols + j; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - int consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - int is_skin = vp9_compute_skin_block( - ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0); + int is_skin = cpi->skin_map[bl_index]; num_16x16_skin += is_skin; num_16x16_nonskin += (1 - is_skin); if (num_16x16_nonskin > 3) { @@ -684,13 +816,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, i = ymis; break; } - ysignal += 16; - usignal += 8; - vsignal += 8; } - ysignal += (sp << 4) - 64; - usignal += (spuv << 3) - 32; - vsignal += (spuv << 3) - 32; } if (num_16x16_skin > 12) { *force_split = 1; @@ -699,7 +825,6 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, } return 0; } -#endif static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, v64x64 *vt, int64_t thresholds[], @@ -767,59 +892,232 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } -static void copy_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, + MACROBLOCKD *xd, BLOCK_SIZE bsize, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE *prev_part = cpi->prev_partition; int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; - MODE_INFO *mi = NULL; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][prev_part[start_pos]]; subsize = get_subsize(bsize, partition); - mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; if (subsize < BLOCK_8X8) { - mi->sb_type = bsize; + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); } else { switch (partition) { - case PARTITION_NONE: mi->sb_type = bsize; break; + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + break; case PARTITION_HORZ: - mi->sb_type = subsize; - if (mi_row + bs < cm->mi_rows) - cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type = - subsize; + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize); break; case PARTITION_VERT: - mi->sb_type = subsize; - if (mi_col + bs < cm->mi_cols) - cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type = - subsize; + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; - case PARTITION_SPLIT: - copy_prev_partition(cpi, subsize, mi_row, mi_col); - copy_prev_partition(cpi, subsize, mi_row + bs, mi_col); - copy_prev_partition(cpi, subsize, mi_row, mi_col + bs); - copy_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + default: + assert(partition == PARTITION_SPLIT); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); + copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } -static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + int mi_row, int mi_col, int segment_id, + int sb_offset) { + int svc_copy_allowed = 1; + int frames_since_key_thresh = 1; + if (cpi->use_svc) { + // For SVC, don't allow copy if base spatial layer is key frame, or if + // frame is not a temporal enhancement layer frame. + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; + frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; + } + if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && + !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE && + cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) { + if (cpi->prev_partition != NULL) { + copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col); + cpi->copied_frame_cnt[sb_offset] += 1; + memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]), + sizeof(x->variance_low)); + return 1; + } + } + + return 0; +} + +// Set the partition for mi_col/row_high (current resolution) based on +// the previous spatial layer (mi_col/row). Returns 0 if partition is set, +// returns 1 if no scale partitioning is done. Return 1 means the variance +// partitioning will be used. +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + const int has_rows = (mi_row_high + bs) < cm->mi_rows; + const int has_cols = (mi_col_high + bs) < cm->mi_cols; + + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + // If the lower layer frame is outside the boundary (this can happen for + // odd size resolutions) then do not scale partitioning from the lower + // layer. Do variance based partitioning instead (return 1). + if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] || + mi_col >= svc->mi_cols[svc->spatial_layer_id - 1]) + return 1; + + // Do not scale partitioning from lower layers on the boundary. Do + // variance based partitioning instead (return 1). + if (!has_rows || !has_cols) return 1; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + + partition_high = partition_lookup[bsl][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs, + subsize_high); + break; + default: + assert(partition_high == PARTITION_SPLIT); + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs, + mi_col_high + bs)) + return 1; + break; + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + default: + assert(partition == PARTITION_SPLIT); + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE *prev_part = cpi->prev_partition; int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; const MODE_INFO *mi = NULL; @@ -843,23 +1141,47 @@ static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: - update_prev_partition(cpi, subsize, mi_row, mi_col); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col); - update_prev_partition(cpi, subsize, mi_row, mi_col + bs); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + default: + assert(partition == PARTITION_SPLIT); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, - unsigned int y_sad, int is_key_frame) { + unsigned int y_sad, int is_key_frame, + int scene_change_detected) { int i; MACROBLOCKD *xd = &x->e_mbd; + int shift = 2; + if (is_key_frame) return; + // For speed > 8, avoid the chroma check if y_sad is above threshold. + if (cpi->oxcf.speed > 8) { + if (y_sad > cpi->vbp_thresholds[1] && + (!cpi->noise_estimate.enabled || + vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium)) + return; + } + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && scene_change_detected) + shift = 5; + for (i = 1; i <= 2; ++i) { unsigned int uv_sad = UINT_MAX; struct macroblock_plane *p = &x->plane[i]; @@ -872,10 +1194,60 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, // TODO(marpan): Investigate if we should lower this threshold if // superblock is detected as skin. - x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); + x->color_sensitivity[i - 1] = uv_sad > (y_sad >> shift); } } +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { + unsigned int tmp_sse; + uint64_t tmp_sad; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->Source->y_buffer; + int src_ystride = cpi->Source->y_stride; + uint8_t *last_src_y = cpi->Last_Source->y_buffer; + int last_src_ystride = cpi->Last_Source->y_stride; + uint64_t avg_source_sad_threshold = 10000; + uint64_t avg_source_sad_threshold2 = 12000; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif + src_y += shift; + last_src_y += shift; + tmp_sad = + cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); + tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_sad < avg_source_sad_threshold) + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff + : kLowSadHighSumdiff; + else + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff + : kHighSadHighSumdiff; + + // Detect large lighting change. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; + + if (cpi->content_state_sb_fd != NULL) { + if (tmp_sad < avg_source_sad_threshold2) { + // Cap the increment to 255. + if (cpi->content_state_sb_fd[sb_offset] < 255) + cpi->content_state_sb_fd[sb_offset]++; + } else { + cpi->content_state_sb_fd[sb_offset] = 0; + } + } + if (tmp_sad == 0) x->zero_temp_sad_source = 1; + return tmp_sad; +} + // This function chooses partitioning based on the variance between source and // reconstructed last, where variance is computed for down-sampled inputs. static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, @@ -884,19 +1256,23 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; int i, j, k, m; v64x64 vt; - v16x16 vt2[16]; + v16x16 *vt2 = NULL; int force_split[21]; int avg_32x32; int max_var_32x32 = 0; int min_var_32x32 = INT_MAX; int var_32x32; int avg_16x16[4]; + int maxvar_16x16[4]; + int minvar_16x16[4]; int64_t threshold_4x4avg; NOISE_LEVEL noise_level = kLow; + int content_state = 0; uint8_t *s; const uint8_t *d; int sp; int dp; + int compute_minmax_variance = 1; unsigned int y_sad = UINT_MAX; BLOCK_SIZE bsize = BLOCK_64X64; // Ref frame used in partitioning. @@ -904,34 +1280,113 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] }; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + int force_64_split = scene_change_detected || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->compute_source_sad_onepass && + cpi->sf.use_source_sad && !x->zero_temp_sad_source); // For the variance computation under SVC mode, we treat the frame as key if // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). - const int is_key_frame = - (cm->frame_type == KEY_FRAME || - (is_one_pass_cbr_svc(cpi) && + int is_key_frame = + (frame_is_intra_only(cm) || + (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); - // Always use 4x4 partition for key frame. - const int use_4x4_partition = cm->frame_type == KEY_FRAME; + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + + // Allow for sub8x8 (4x4) partition on key frames, but only for hybrid mode + // (i.e., sf->nonrd_keyframe = 0), where for small blocks rd intra pickmode + // (vp9_rd_pick_intra_mode_sb) is used. The nonrd intra pickmode + // (vp9_pick_intra_mode) does not currently support sub8x8 blocks. This causes + // the issue: 44166813. Assert is added in vp9_pick_intra_mode to check this. + const int use_4x4_partition = + frame_is_intra_only(cm) && !cpi->sf.nonrd_keyframe; const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; - int offset = cm->mi_stride * mi_row + mi_col; + int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); + + // For SVC: check if LAST frame is NULL or if the resolution of LAST is + // different than the current frame resolution, and if so, treat this frame + // as a key frame, for the purpose of the superblock partitioning. + // LAST == NULL can happen in some cases where enhancement spatial layers are + // enabled dyanmically in the stream and the only reference is the spatial + // reference (GOLDEN). + if (cpi->use_svc) { + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME); + if (ref == NULL || ref->y_crop_height != cm->height || + ref->y_crop_width != cm->width) + is_key_frame = 1; + } set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); segment_id = xd->mi[0]->segment_id; - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - if (cyclic_refresh_segment_id_boosted(segment_id)) { - int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - set_vbp_thresholds(cpi, thresholds, q); + + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (cpi->sf.use_source_sad && !is_key_frame) { + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + content_state = x->content_state_sb; + x->skip_low_source_sad = (content_state == kLowSadLowSumdiff || + content_state == kLowSadHighSumdiff) + ? 1 + : 0; + x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; + if (cpi->content_state_sb_fd != NULL) + x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } + // If source_sad is low copy the partition without computing the y_sad. + if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + !force_64_split && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; } } + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id)) { + int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); + } + // Decrease 32x32 split threshold for screen on base layer, for scene + // change/high motion frames. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == 0 && force_64_split) + thresholds[1] = 3 * thresholds[1] >> 2; + // For non keyframes, disable 4x4 average for low resolution when speed = 8 threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; - memset(x->variance_low, 0, sizeof(x->variance_low)); - if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); @@ -940,7 +1395,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, // 5-20 for the 16x16 blocks. - force_split[0] = 0; + force_split[0] = force_64_split; if (!is_key_frame) { // In the case of spatial/temporal scalable coding, the assumption here is @@ -956,13 +1411,16 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) { + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); } - if (yv12_g && yv12_g != yv12 && (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + // Only compute y_sad_g (sad for golden reference) for speed < 8. + if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, &cm->frame_refs[GOLDEN_FRAME - 1].sf); y_sad_g = cpi->fn_ptr[bsize].sdf( @@ -984,12 +1442,41 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source && + cm->width > 640 && cm->height > 480) { + // Disable split below 16x16 block size when scroll motion (horz or + // vert) is detected. + // TODO(marpan/jianj): Improve this condition: issue is that search + // range is hard-coded/limited in vp9_int_pro_motion_estimation() so + // scroll motion may not be detected here. + if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) || + (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) && + y_sad < 100000) { + compute_minmax_variance = 0; + thresholds[2] = INT64_MAX; + } + } + } + y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad // are close if short_circuit_low_temp_var is on. @@ -1009,12 +1496,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = 0; -#if !CONFIG_VP9_HIGHBITDEPTH if (cpi->use_skin_detection) - x->sb_is_skin = - skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); -#endif + x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -1027,23 +1510,29 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows) { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); - chroma_check(cpi, x, bsize, y_sad, is_key_frame); + x->variance_low[0] = 1; + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } return 0; } } // If the y_sad is small enough, copy the partition of the superblock in the // last frame to current frame only if the last frame is not a keyframe. + // Stop the copy every cpi->max_copied_frame to refresh the partition. // TODO(jianj) : tune the threshold. - if (cpi->sf.copy_partition_flag && cpi->rc.frames_since_key > 1 && - segment_id == CR_SEGMENT_ID_BASE && - cpi->prev_segment_id[offset] == CR_SEGMENT_ID_BASE && - y_sad_last < cpi->vbp_threshold_copy) { - if (cpi->prev_partition != NULL) { - copy_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); - chroma_check(cpi, x, bsize, y_sad, is_key_frame); - return 0; - } + if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && + copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + return 0; } } else { d = VP9_VAR_OFFS; @@ -1060,6 +1549,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_VP9_HIGHBITDEPTH } + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. for (i = 0; i < 4; i++) { @@ -1068,6 +1559,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int i2 = i << 2; force_split[i + 1] = 0; avg_16x16[i] = 0; + maxvar_16x16[i] = 0; + minvar_16x16[i] = INT_MAX; for (j = 0; j < 4; j++) { const int x16_idx = x32_idx + ((j & 1) << 4); const int y16_idx = y32_idx + ((j >> 1) << 4); @@ -1084,13 +1577,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); get_variance(&vt.split[i].split[j].part_variances.none); avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i]) + minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; + if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i]) + maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance; if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) { // 16X16 variance is above threshold for split, so force split to 8x8 // for this 16x16 block (this also forces splits for upper levels). force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (cpi->oxcf.speed < 8 && + } else if (compute_minmax_variance && vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { @@ -1102,16 +1599,19 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, xd->cur_buf->flags, #endif pixels_wide, pixels_high); - if (minmax > cpi->vbp_threshold_minmax) { + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1128,6 +1628,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } + if (cpi->noise_estimate.enabled) + noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); // Fill the rest of the variance tree by summing split partition values. avg_32x32 = 0; for (i = 0; i < 4; i++) { @@ -1163,6 +1665,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) { force_split[i + 1] = 1; force_split[0] = 1; + } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 && + (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) && + maxvar_16x16[i] > thresholds[1]) { + force_split[i + 1] = 1; + force_split[0] = 1; } avg_32x32 += var_32x32; } @@ -1170,13 +1677,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (!force_split[0]) { fill_variance_tree(&vt, BLOCK_64X64); get_variance(&vt.part_variances.none); - if (cpi->noise_estimate.enabled) - noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); // If variance of this 64x64 block is above (some threshold of) the average // variance over the sub-32x32 blocks, then force this block to split. // Only checking this for noise level >= medium for now. if (!is_key_frame && noise_level >= kMedium && - vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + vt.part_variances.none.variance > (9 * avg_32x32) >> 5) force_split[0] = 1; // Else if the maximum 32x32 variance minus the miniumum 32x32 variance in // a 64x64 block is greater than threshold and the maximum 32x32 variance is @@ -1206,7 +1711,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int y16_idx = ((j >> 1) << 1); // For inter frames: if variance4x4downsample[] == 1 for this 16x16 // block, then the variance is based on 4x4 down-sampling, so use vt2 - // in set_vt_partioning(), otherwise use vt. + // in set_vt_partitioning(), otherwise use vt. v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) ? &vt2[i2 + j] : &vt.split[i].split[j]; @@ -1239,20 +1744,25 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { - update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); - cpi->prev_segment_id[offset] = segment_id; + if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } + if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); } - chroma_check(cpi, x, bsize, y_sad, is_key_frame); + chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected); + if (vt2) vpx_free(vt2); return 0; } +#if !CONFIG_REALTIME_ONLY static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled) { @@ -1294,7 +1804,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize, ctx->rate, ctx->dist, x->skip, p); } @@ -1357,8 +1868,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, vp9_update_mv_count(td); if (cm->interp_filter == SWITCHABLE) { - const int ctx = get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[ctx][xdmi->interp_filter]; + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; } } @@ -1381,6 +1892,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, } } } +#endif // !CONFIG_REALTIME_ONLY void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { @@ -1398,13 +1910,17 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, } static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, + INTERP_FILTER interp_filter, RD_COST *rd_cost, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; INTERP_FILTER filter_ref; filter_ref = get_pred_context_switchable_interp(xd); - if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP; + if (interp_filter == BILINEAR) + filter_ref = BILINEAR; + else if (filter_ref == SWITCHABLE_FILTERS) + filter_ref = EIGHTTAP; mi->sb_type = bsize; mi->mode = ZEROMV; @@ -1413,7 +1929,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, mi->skip = 1; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; mi->interp_filter = filter_ref; @@ -1423,20 +1939,41 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, vp9_rd_cost_init(rd_cost); } -static int set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, - int8_t segment_id) { - int segment_qindex; +#if !CONFIG_REALTIME_ONLY +static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + vp9_init_plane_quantizers(cpi, x); vpx_clear_system_state(); - segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); + + if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) { + if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult; + } else if (aq_mode == PERCEPTUAL_AQ) { + x->rdmult = x->cb_rdmult; + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted( + get_segment_id(cm, map, bsize, mi_row, mi_col))) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } else { + x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + } + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } } static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + PICK_MODE_CONTEXT *ctx, int rate_in_best_rd, + int64_t dist_in_best_rd) { VP9_COMMON *const cm = &cpi->common; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; @@ -1445,8 +1982,12 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.aq_mode; int i, orig_rdmult; + int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -1485,59 +2026,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; - if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) { + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { double logvar = vp9_log_block_var(cpi, x, bsize); - // Check block complexity as part of descision on using pixel or transform + // Check block complexity as part of decision on using pixel or transform // domain distortion in rd tests. x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && (logvar >= cpi->sf.tx_domain_thresh); - // Check block complexity as part of descision on using quantized - // coefficient optimisation inside the rd loop. - x->block_qcoeff_opt = - cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh); + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. + x->log_block_src_var = logvar; } else { x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; - x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; + x->log_block_src_var = 0.0; } - if (aq_mode == VARIANCE_AQ) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - - if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->force_update_segmentation || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { - mi->segment_id = vp9_vaq_segment_id(energy); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == LOOKAHEAD_AQ) { - const uint8_t *const map = cpi->segmentation_map; - - // I do not change rdmult here consciously. - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else if (aq_mode == EQUATOR360_AQ) { - if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) { - mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == COMPLEXITY_AQ) { - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == CYCLIC_REFRESH_AQ) { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - // If segment is boosted, use rdmult for that segment. - if (cyclic_refresh_segment_id_boosted( - get_segment_id(cm, map, bsize, mi_row, mi_col))) - x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode); + if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) { + best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd, + dist_in_best_rd); } // Find best coding mode & reconstruct the MB so it is available @@ -1546,15 +2055,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -1566,15 +2087,22 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } - x->rdmult = orig_rdmult; - // TODO(jingning) The rate-distortion optimization flow needs to be // refactored to provide proper exit/return handle. - if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX) + rd_cost->rdcost = INT64_MAX; + else + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + + x->rdmult = orig_rdmult; ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } +#endif // !CONFIG_REALTIME_ONLY static void update_stats(VP9_COMMON *cm, ThreadData *td) { const MACROBLOCK *x = &td->mb; @@ -1600,8 +2128,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { [has_second_ref(mi)]++; if (has_second_ref(mi)) { - counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [ref0 == GOLDEN_FRAME]++; + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1]; + counts->comp_ref[ctx][bit]++; } else { counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] [ref0 != LAST_FRAME]++; @@ -1633,6 +2163,7 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } +#if !CONFIG_REALTIME_ONLY static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -1697,6 +2228,16 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td, PICK_MODE_CONTEXT *ctx) { MACROBLOCK *const x = &td->mb; set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + + if (cpi->sf.enable_tpl_model && + (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + x->rdmult = x->cb_rdmult; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } + } + update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); @@ -1755,27 +2296,28 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->leaf_split[0]); + pc_tree->u.leaf_split[0]); } else { encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize, pc_tree->split[3]); + subsize, pc_tree->u.split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#endif // !CONFIG_REALTIME_ONLY // Check to see if the given partition size is allowed for a specified number // of 8x8 block rows and columns remaining in the image. @@ -1849,120 +2391,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } -static const struct { - int row; - int col; -} coord_lookup[16] = { - // 32x32 index = 0 - { 0, 0 }, - { 0, 2 }, - { 2, 0 }, - { 2, 2 }, - // 32x32 index = 1 - { 0, 4 }, - { 0, 6 }, - { 2, 4 }, - { 2, 6 }, - // 32x32 index = 2 - { 4, 0 }, - { 4, 2 }, - { 6, 0 }, - { 6, 2 }, - // 32x32 index = 3 - { 4, 4 }, - { 4, 6 }, - { 6, 4 }, - { 6, 6 }, -}; - -static void set_source_var_based_partition(VP9_COMP *cpi, - const TileInfo *const tile, - MACROBLOCK *const x, - MODE_INFO **mi_8x8, int mi_row, - int mi_col) { - VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mi_stride; - const int row8x8_remaining = tile->mi_row_end - mi_row; - const int col8x8_remaining = tile->mi_col_end - mi_col; - MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; - - vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); - - assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); - - // In-image SB64 - if ((col8x8_remaining >= MI_BLOCK_SIZE) && - (row8x8_remaining >= MI_BLOCK_SIZE)) { - int i, j; - int index; - diff d32[4]; - const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); - int is_larger_better = 0; - int use32x32 = 0; - unsigned int thr = cpi->source_var_thresh; - - memset(d32, 0, 4 * sizeof(diff)); - - for (i = 0; i < 4; i++) { - diff *d16[4]; - - for (j = 0; j < 4; j++) { - int b_mi_row = coord_lookup[i * 4 + j].row; - int b_mi_col = coord_lookup[i * 4 + j].col; - int boffset = b_mi_row / 2 * cm->mb_cols + b_mi_col / 2; - - d16[j] = cpi->source_diff_var + offset + boffset; - - index = b_mi_row * mis + b_mi_col; - mi_8x8[index] = mi_upper_left + index; - mi_8x8[index]->sb_type = BLOCK_16X16; - - // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition - // size to further improve quality. - } - - is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) && - (d16[2]->var < thr) && (d16[3]->var < thr); - - // Use 32x32 partition - if (is_larger_better) { - use32x32 += 1; - - for (j = 0; j < 4; j++) { - d32[i].sse += d16[j]->sse; - d32[i].sum += d16[j]->sum; - } - - d32[i].var = - (unsigned int)(d32[i].sse - - (unsigned int)(((int64_t)d32[i].sum * d32[i].sum) >> - 10)); - - index = coord_lookup[i * 4].row * mis + coord_lookup[i * 4].col; - mi_8x8[index] = mi_upper_left + index; - mi_8x8[index]->sb_type = BLOCK_32X32; - } - } - - if (use32x32 == 4) { - thr <<= 1; - is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) && - (d32[2].var < thr) && (d32[3].var < thr); - - // Use 64x64 partition - if (is_larger_better) { - mi_8x8[0] = mi_upper_left; - mi_8x8[0]->sb_type = BLOCK_64X64; - } - } - } else { // partial in-image SB64 - int bh = num_8x8_blocks_high_lookup[BLOCK_16X16]; - int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16]; - set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining, - col8x8_remaining, BLOCK_16X16, mi_8x8); - } -} - static void update_state_rt(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, int bsize) { @@ -1980,17 +2408,17 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, *(xd->mi[0]) = ctx->mic; *(x->mbmi_ext) = ctx->mbmi_ext; - if (seg->enabled && cpi->oxcf.aq_mode != NO_AQ) { - // For in frame complexity AQ or variance AQ, copy segment_id from - // segmentation_map. - if (cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) { + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled || + cpi->active_map.enabled)) { + // Setting segmentation map for cyclic_refresh. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) { + vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip, p); + } else { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else { - // Setting segmentation map for cyclic_refresh. - vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, - ctx->rate, ctx->dist, x->skip, p); } vp9_init_plane_quantizers(cpi, x); } @@ -2028,7 +2456,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } x->skip = ctx->skip; - x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0]; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, @@ -2096,24 +2524,25 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); + pc_tree->u.split[0]); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); + subsize, pc_tree->u.split[1]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); + subsize, pc_tree->u.split[2]); encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, - output_enabled, subsize, pc_tree->split[3]); + output_enabled, subsize, pc_tree->u.split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#if !CONFIG_REALTIME_ONLY static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, @@ -2182,7 +2611,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, mi_col + (mi_step >> 1) < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx, - INT64_MAX); + INT_MAX, INT64_MAX); pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2201,20 +2630,23 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize, - ctx, INT64_MAX); + ctx, INT_MAX, INT64_MAX); break; case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->horizontal[0], INT64_MAX); + subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, - &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); + &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, + INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2225,18 +2657,20 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->vertical[0], INT64_MAX); + subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), - &tmp_rdc, subsize, - &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; + rd_pick_sb_modes( + cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2246,10 +2680,11 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, pc_tree->leaf_split[0], INT64_MAX); + subsize, pc_tree->u.leaf_split[0], INT_MAX, INT64_MAX); break; } last_part_rdc.rate = 0; @@ -2267,7 +2702,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, &tmp_rdc.dist, i != 3, - pc_tree->split[i]); + pc_tree->u.split[i]); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2276,7 +2711,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2304,17 +2738,15 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * (mi_step >> 1); int y_idx = (i >> 1) * (mi_step >> 1); RD_COST tmp_rdc; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); - pc_tree->split[i]->partitioning = PARTITION_NONE; + pc_tree->u.split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, split_subsize, &pc_tree->split[i]->none, - INT64_MAX); + &tmp_rdc, split_subsize, &pc_tree->u.split[i]->none, + INT_MAX, INT64_MAX); restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2328,7 +2760,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (i != 3) encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize, pc_tree->split[i]); + split_subsize, pc_tree->u.split[i]); pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, split_subsize); @@ -2511,14 +2943,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, min_size = BLOCK_64X64; max_size = BLOCK_4X4; - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - mi = prev_mi[idy * cm->mi_stride + idx]; - bs = mi ? mi->sb_type : bsize; - min_size = VPXMIN(min_size, bs); - max_size = VPXMAX(max_size, bs); - } + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); } } @@ -2548,6 +2978,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, *min_bs = min_size; *max_bs = max_size; } +#endif // !CONFIG_REALTIME_ONLY static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); @@ -2557,63 +2988,689 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } -#if CONFIG_FP_MB_STATS -const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 4, 4 }; -const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, - 2, 1, 2, 4, 2, 4 }; -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { - 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120 -}; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { - 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120 -}; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6 -}; +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +static void nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; -typedef enum { - MV_ZERO = 0, - MV_LEFT = 1, - MV_UP = 2, - MV_RIGHT = 3, - MV_DOWN = 4, - MV_INVALID -} MOTION_DIRECTION; + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + int layer, node, i; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = VPXMAX(val, 0.0f); + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } -static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { - if (fp_byte & FPMB_MOTION_ZERO_MASK) { - return MV_ZERO; - } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { - return MV_LEFT; - } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { - return MV_RIGHT; - } else if (fp_byte & FPMB_MOTION_UP_MASK) { - return MV_UP; - } else { - return MV_DOWN; + // Final output layer. + { + const float *weights = nn_config->weights[num_layers]; + for (node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } } } -static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, - MOTION_DIRECTION that_mv) { - if (this_mv == that_mv) { - return 0; - } else { - return abs(this_mv - that_mv) == 2 ? 2 : 1; +#if !CONFIG_REALTIME_ONLY +#define FEATURES 7 +// Machine-learning based partition search early termination. +// Return 1 to skip split and rect partitions. +static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int mag_mv = + abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); + const int left_in_image = !!xd->left_mi; + const int above_in_image = !!xd->above_mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row]; + int above_par = 0; // above_partitioning + int left_par = 0; // left_partitioning + int last_par = 0; // last_partitioning + int offset = 0; + int i; + BLOCK_SIZE context_size; + const NN_CONFIG *nn_config = NULL; + const float *mean, *sd, *linear_weights; + float nn_score, linear_score; + float features[FEATURES]; + + assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + vpx_clear_system_state(); + + switch (bsize) { + case BLOCK_64X64: + offset = 0; + nn_config = &vp9_partition_nnconfig_64x64; + break; + case BLOCK_32X32: + offset = 8; + nn_config = &vp9_partition_nnconfig_32x32; + break; + case BLOCK_16X16: + offset = 16; + nn_config = &vp9_partition_nnconfig_16x16; + break; + default: assert(0 && "Unexpected block size."); return 0; } + + if (above_in_image) { + context_size = xd->above_mi->sb_type; + if (context_size < bsize) + above_par = 2; + else if (context_size == bsize) + above_par = 1; + } + + if (left_in_image) { + context_size = xd->left_mi->sb_type; + if (context_size < bsize) + left_par = 2; + else if (context_size == bsize) + left_par = 1; + } + + if (prev_mi[0]) { + context_size = prev_mi[0]->sb_type; + if (context_size < bsize) + last_par = 2; + else if (context_size == bsize) + last_par = 1; + } + + mean = &vp9_partition_feature_mean[offset]; + sd = &vp9_partition_feature_std[offset]; + features[0] = ((float)ctx->rate - mean[0]) / sd[0]; + features[1] = ((float)ctx->dist - mean[1]) / sd[1]; + features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2]; + features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3]; + features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4]; + features[5] = ((float)cm->base_qindex - mean[5]) * sd[5]; + features[6] = ((float)last_par - mean[6]) * sd[6]; + + // Predict using linear model. + linear_weights = &vp9_partition_linear_weights[offset]; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + if (linear_score > 0.1f) return 0; + + // Predict using neural net model. + nn_predict(features, nn_config, &nn_score); + + if (linear_score < -0.0f && nn_score < 0.1f) return 1; + if (nn_score < -0.0f && linear_score < 0.1f) return 1; + return 0; } +#undef FEATURES + +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); + const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720; + const int resolution_ctx = is_720p_or_larger ? 1 : 0; + + switch (bsize) { + case BLOCK_64X64: + linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx]; + break; + case BLOCK_32X32: + linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx]; + break; + case BLOCK_16X16: + linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx]; + break; + case BLOCK_8X8: + linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx]; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; + + { // Generate feature values. +#if CONFIG_VP9_HIGHBITDEPTH + const int ac_q = + vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; + +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd); + } else { + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); #endif + var = var >> num_pels_log2; + vpx_clear_system_state(); + + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } + + { // Calculate the output score. + int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } + + return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx]; +} +#undef FEATURES + +#define FEATURES 8 +#define LABELS 4 +static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, + const PC_TREE *const pc_tree, + int *allow_horz, int *allow_vert, + int64_t ref_rd) { + const NN_CONFIG *nn_config = NULL; + float score[LABELS] = { + 0.0f, + }; + int thresh = -1; + int i; + (void)x; + + if (ref_rd <= 0 || ref_rd > 1000000000) return; + + switch (bsize) { + case BLOCK_8X8: break; + case BLOCK_16X16: + nn_config = &vp9_rect_part_nnconfig_16; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &vp9_rect_part_nnconfig_32; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &vp9_rect_part_nnconfig_64; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3]; + break; + default: assert(0 && "Unexpected block size."); return; + } + if (!nn_config || thresh < 0) return; + + // Feature extraction and model score calculation. + { + const VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + int feature_index = 0; + float features[FEATURES]; + + features[feature_index++] = logf((float)dc_q + 1.0f); + features[feature_index++] = + (float)(pc_tree->partitioning == PARTITION_NONE); + features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f); + + { + const float norm_factor = 1.0f / ((float)ref_rd + 1.0f); + const int64_t none_rdcost = pc_tree->none.rdcost; + float rd_ratio = 2.0f; + if (none_rdcost > 0 && none_rdcost < 1000000000) + rd_ratio = (float)none_rdcost * norm_factor; + features[feature_index++] = VPXMIN(rd_ratio, 2.0f); + + for (i = 0; i < 4; ++i) { + const int64_t this_rd = pc_tree->u.split[i]->none.rdcost; + const int rd_valid = this_rd > 0 && this_rd < 1000000000; + // Ratio between sub-block RD and whole block RD. + features[feature_index++] = + rd_valid ? (float)this_rd * norm_factor : 1.0f; + } + } + + assert(feature_index == FEATURES); + nn_predict(features, nn_config, score); + } + + // Make decisions based on the model score. + { + int max_score = -1000; + int horz = 0, vert = 0; + int int_score[LABELS]; + for (i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = VPXMAX(int_score[i], max_score); + } + thresh = max_score - thresh; + for (i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) horz = 1; + if ((i >> 1) & 1) vert = 1; + } + } + *allow_horz = *allow_horz && horz; + *allow_vert = *allow_vert && vert; + } +} +#undef FEATURES +#undef LABELS + +// Perform fast and coarse motion search for the given block. This is a +// pre-processing step for the ML based partition search speedup. +static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV ref_mv, MV_REFERENCE_FRAME ref, + uint8_t *const pred_buf) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12; + YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; + MV best_mv = { 0, 0 }; + int cost_list[5]; + struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } }; + + if (scaled_ref_frame) { + yv12 = scaled_ref_frame; + // As reported in b/311294795, the reference buffer pointer needs to be + // saved and restored after the search. Otherwise, it causes problems while + // the reference frame scaling happens. + for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0]; + } else { + yv12 = get_ref_frame_buffer(cpi, ref); + } + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); + mi->ref_frame[0] = ref; + mi->ref_frame[1] = NO_REF_FRAME; + mi->sb_type = bsize; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, + &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + // Restore reference buffer pointer. + if (scaled_ref_frame) { + for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i]; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); +} + +// Use a neural net model to prune partition-none and partition-split search. +// Features used: QP; spatial block size contexts; variance of prediction +// residue after simple_motion_search. +#define FEATURES 12 +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + const VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); + uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? (CONVERT_TO_BYTEPTR(pred_buffer)) + : pred_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]); + uint8_t *const pred_buf = pred_buffer; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int speed = cpi->oxcf.speed; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_part_split_nnconfig_64; + thresh = speed > 0 ? 2.8f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_part_split_nnconfig_32; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_part_split_nnconfig_16; + thresh = speed > 0 ? 3.8f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_part_split_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.8f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + MV ref_mv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + // If bsize is 64x64, use zero MV as reference; otherwise, use MV result + // of previous(larger) block as reference. + if (bsize == BLOCK_64X64) + ref_mv.row = ref_mv.col = 0; + else + ref_mv = pc_tree->mv; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf); + pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv; + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)dc_q + 1.0f); + + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + const int has_above = !!xd->above_mi; + const int has_left = !!xd->left_mi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize; + int i; + + features[feature_idx++] = (float)has_above; + features[feature_idx++] = (float)b_width_log2_lookup[above_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[above_bsize]; + features[feature_idx++] = (float)has_left; + features[feature_idx++] = (float)b_width_log2_lookup[left_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[left_bsize]; + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. + if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY + +static double log_wiener_var(int64_t wiener_variance) { + return log(1.0 + wiener_variance) / log(2.0); +} + +static void build_kmeans_segmentation(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + BLOCK_SIZE bsize = BLOCK_64X64; + KMEANS_DATA *kmeans_data; + + vp9_disable_segmentation(&cm->seg); + if (cm->show_frame) { + int mi_row, mi_col; + cpi->kmeans_data_size = 0; + cpi->kmeans_ctr_num = 8; + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = VPXMIN( + (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = VPXMIN( + (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col; + int64_t wiener_variance = 0; + + for (row = mb_row_start; row < mb_row_end; ++row) + for (col = mb_col_start; col < mb_col_end; ++col) + wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col]; + + wiener_variance /= + (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + + kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++]; + kmeans_data->value = log_wiener_var(wiener_variance); + kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + } + } + + vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls, + cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr, + cpi->kmeans_data_size); + + vp9_perceptual_aq_mode_setup(cpi, &cm->seg); + } +} + +#if !CONFIG_REALTIME_ONLY +static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *cm = &cpi->common; + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = + VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = + VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col, idx; + int64_t wiener_variance = 0; + int segment_id; + int8_t seg_hist[MAX_SEGMENTS] = { 0 }; + int8_t max_count = 0, max_index = -1; + + vpx_clear_system_state(); + + assert(cpi->norm_wiener_variance > 0); + + for (row = mb_row_start; row < mb_row_end; ++row) { + for (col = mb_col_start; col < mb_col_end; ++col) { + wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col]; + segment_id = + vp9_get_group_idx(log_wiener_var(wiener_variance), + cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num); + ++seg_hist[segment_id]; + } + } + + for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) { + if (seg_hist[idx] > max_count) { + max_count = seg_hist[idx]; + max_index = idx; + } + } + + assert(max_index >= 0); + segment_id = max_index; + + return segment_id; +} + +static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + const int gf_group_index = cpi->twopass.gf_group.index; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + int mi_wide = num_8x8_blocks_wide_lookup[bsize]; + int mi_high = num_8x8_blocks_high_lookup[bsize]; + int row, col; + + int dr = 0; + double r0, rk, beta; + + TplDepFrame *tpl_frame; + TplDepStats *tpl_stats; + int tpl_stride; + + if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult; + tpl_frame = &cpi->tpl_stats[gf_group_index]; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + tpl_stats = tpl_frame->tpl_stats_ptr; + tpl_stride = tpl_frame->stride; + + if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult; + + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE; + int sb_stride = (cpi->common.width + sb_size - 1) / sb_size; + int sby = mi_row / 8; + int sbx = mi_col / 8; + return (int)((cpi->sb_mul_scale[sby * sb_stride + sbx] * orig_rdmult) / + 256); + } + + for (row = mi_row; row < mi_row + mi_high; ++row) { + for (col = mi_col; col < mi_col + mi_wide; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + + if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue; + + intra_cost += this_stats->intra_cost; + mc_dep_cost += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + r0 = cpi->rd.r0; + rk = (double)intra_cost / mc_dep_cost; + beta = r0 / rk; + dr = vp9_get_adaptive_rdmult(cpi, beta); + + dr = clamp(dr, orig_rdmult * 1 / 2, orig_rdmult * 3 / 2); + dr = VPXMAX(1, dr); + + return dr; +} +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_REALTIME_ONLY // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE bsize, - RD_COST *rd_cost, int64_t best_rd, - PC_TREE *pc_tree) { +static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_COST *rd_cost, RD_COST best_rdc, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2621,11 +3678,11 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = &pc_tree->none; + PICK_MODE_CONTEXT *const ctx = &pc_tree->none; int i; const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); BLOCK_SIZE subsize; - RD_COST this_rdc, sum_rdc, best_rdc; + RD_COST this_rdc, sum_rdc; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; INTERP_FILTER pred_interp_filter; @@ -2639,37 +3696,43 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, BLOCK_SIZE min_size = x->min_partition_size; BLOCK_SIZE max_size = x->max_partition_size; -#if CONFIG_FP_MB_STATS - unsigned int src_diff_var = UINT_MAX; - int none_complexity = 0; -#endif - int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr; - int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr; + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; + int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + int must_split = 0; + int should_encode_sb = 0; + + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; + + int partition_mul = x->cb_rdmult; (void)*tp_orig; assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - // Adjust dist breakout threshold according to the partition size. dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; vp9_rd_cost_init(&this_rdc); vp9_rd_cost_init(&sum_rdc); - vp9_rd_cost_reset(&best_rdc); - best_rdc.rdcost = best_rd; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul); + } + vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc); + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ) x->mb_energy = vp9_block_energy(cpi, x, bsize); @@ -2684,10 +3747,18 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } + // Get sub block energy range + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + must_split = (min_energy < -3) && (max_energy - min_energy > 2); + } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_none_allowed &= (bsize <= max_size); partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); partition_vert_allowed &= @@ -2696,7 +3767,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (cpi->sf.use_square_partition_only && - bsize > cpi->sf.use_square_only_threshold) { + (bsize > cpi->sf.use_square_only_thresh_high || + bsize < cpi->sf.use_square_only_thresh_low)) { if (cpi->use_svc) { if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) partition_horz_allowed &= force_horz_split; @@ -2710,144 +3782,101 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, save_context(x, mi_row, mi_col, a, l, sa, sl, bsize); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row, - mi_col, bsize); - } -#endif + pc_tree->partitioning = PARTITION_NONE; -#if CONFIG_FP_MB_STATS - // Decide whether we shall split directly and skip searching NONE by using - // the first pass block statistics - if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && - partition_none_allowed && src_diff_var > 4 && - cm->base_qindex < qindex_split_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - // compute a complexity measure, basically measure inconsistency of motion - // vectors obtained from the first pass in the current block - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - - MOTION_DIRECTION this_mv; - MOTION_DIRECTION right_mv; - MOTION_DIRECTION bottom_mv; - - this_mv = - get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); - - // to its right - if (c != mb_col_end - 1) { - right_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + 1]); - none_complexity += get_motion_inconsistency(this_mv, right_mv); - } - - // to its bottom - if (r != mb_row_end - 1) { - bottom_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); - none_complexity += get_motion_inconsistency(this_mv, bottom_mv); - } - - // do not count its left and top neighbors to avoid double counting + if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { + const int do_rd_ml_partition_var_pruning = + partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; + if (do_rd_ml_partition_var_pruning) { + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + // ml_predict_var_rd_partitioning() may pruune out either + // partition_none_allowed or do_split, but we should keep the + // partition_none_allowed for 8x8 blocks unless disable_split_mask is + // off (0). + if (bsize == BLOCK_8X8 && cpi->sf.disable_split_mask && + partition_none_allowed == 0) { + partition_none_allowed = 1; } + } else { + vp9_zero(pc_tree->mv); } - - if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { - partition_none_allowed = 0; + if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. + for (i = 0; i < 4; ++i) pc_tree->u.split[i]->mv = pc_tree->mv; } } -#endif // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, - best_rdc.rdcost); + best_rdc.rate, best_rdc.dist); + ctx->rdcost = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } if (bsize >= BLOCK_8X8) { this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); } if (this_rdc.rdcost < best_rdc.rdcost) { + MODE_INFO *mi = xd->mi[0]; + best_rdc = this_rdc; + should_encode_sb = 1; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - // If all y, u, v transform blocks in this partition are skippable, and - // the dist & rate are within the thresholds, the partition search is - // terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; - } - -#if CONFIG_FP_MB_STATS - // Check if every 16x16 first pass block statistics has zero - // motion and the corresponding first pass residue is small enough. - // If that is the case, check the difference variance between the - // current frame and the last frame. If the variance is small enough, - // stop further splitting in RD optimization - if (cpi->use_fp_mb_stats && do_split != 0 && - cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - VPXMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - VPXMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - int skip = 1; - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - if (!(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_MOTION_ZERO_MASK) || - !(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_ERROR_SMALL_MASK)) { - skip = 0; - break; - } - } - if (skip == 0) { - break; - } - } - - if (skip) { - if (src_diff_var == UINT_MAX) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, bsize); - } - if (src_diff_var < 8) { + if (cpi->sf.rd_ml_partition.search_early_termination) { + // Currently, the machine-learning based partition search early + // termination is only used while bsize is 16x16, 32x32 or 64x64, + // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. + if (!x->e_mbd.lossless && + !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && + ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { + if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) { do_split = 0; do_rect = 0; } } } -#endif + + if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) { + const int use_ml_based_breakout = + cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { + do_split = 0; + do_rect = 0; + } + } else { + if (!cpi->sf.rd_ml_partition.search_early_termination) { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } + } + } } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector - if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx); + store_pred_mv(x, ctx); // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an // intra block and used for context purposes. @@ -2860,112 +3889,192 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. - if (do_split) { + pc_tree->u.split[0]->none.rdcost = 0; + pc_tree->u.split[1]->none.rdcost = 0; + pc_tree->u.split[2]->none.rdcost = 0; + pc_tree->u.split[3]->none.rdcost = 0; + if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); + load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) - pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; + pc_tree->u.leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - pc_tree->leaf_split[0], best_rdc.rdcost); - - if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; + pc_tree->u.leaf_split[0], best_rdc.rate, best_rdc.dist); + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->u.leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->u.leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } } else { - for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); + ++i) { const int x_idx = (i & 1) * mi_step; const int y_idx = (i >> 1) * mi_step; + int found_best_rd = 0; + RD_COST best_rdc_split; + vp9_rd_cost_reset(&best_rdc_split); + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + // A must split test here increases the number of sub + // partitions but hurts metrics results quite a bit, + // so this extra test is commented out pending + // further tests on whether it adds much in terms of + // visual quality. + // (must_split) ? best_rdc.rate + // : best_rdc.rate - sum_rdc.rate, + // (must_split) ? best_rdc.dist + // : best_rdc.dist - sum_rdc.dist, + best_rdc_split.rate = best_rdc.rate - sum_rdc.rate; + best_rdc_split.dist = best_rdc.dist - sum_rdc.dist; + } if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + pc_tree->u.split[i]->index = i; + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->u.split[i]->none.rate = INT_MAX; + found_best_rd = rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, best_rdc_split, pc_tree->u.split[i]); - pc_tree->split[i]->index = i; - rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); - - if (this_rdc.rate == INT_MAX) { + if (found_best_rd == 0) { sum_rdc.rdcost = INT64_MAX; break; } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->u.split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->u.split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->u.split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } } - if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) { + if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) { sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); - if (sum_rdc.rdcost < best_rdc.rdcost) { + if ((sum_rdc.rdcost < best_rdc.rdcost) || + (must_split && (sum_rdc.dist < best_rdc.dist))) { best_rdc = sum_rdc; + should_encode_sb = 1; pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. - if (!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + if (!cpi->sf.rd_ml_partition.search_early_termination && + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } } else { // skip rectangular partition test when larger block size // gives better rd cost - if ((cpi->sf.less_rectangular_check) && - ((bsize > cpi->sf.use_square_only_threshold) || - (best_rdc.dist < dist_breakout_thr))) + if (cpi->sf.less_rectangular_check && + (bsize > cpi->sf.use_square_only_thresh_high || + best_rdc.dist < dist_breakout_thr)) do_rect &= !partition_none_allowed; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) { + pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) { + pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) { + pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames & 0xff; + } + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) { + pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames & 0xff; + } + } + + { + const int do_ml_rect_partition_pruning = + !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split && + (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8; + if (do_ml_rect_partition_pruning) { + ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed, + &partition_vert_allowed, best_rdc.rdcost); + } + } + // PARTITION_HORZ if (partition_horz_allowed && (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ]; subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->horizontal[0], best_rdc.rdcost); + &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, subsize, &pc_tree->horizontal[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_HORZ; + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_HORZ; - if ((cpi->sf.less_rectangular_check) && - (bsize > cpi->sf.use_square_only_threshold)) - do_rect = 0; - } + if (cpi->sf.less_rectangular_check && + bsize > cpi->sf.use_square_only_thresh_high) + do_rect = 0; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -2973,59 +4082,74 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT]; subsize = get_subsize(bsize, PARTITION_VERT); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->vertical[0], best_rdc.rdcost); + &pc_tree->vertical[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, &pc_tree->vertical[0]); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, subsize, &pc_tree->vertical[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_VERT; - } + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_VERT; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } - // TODO(jbb): This code added so that we avoid static analysis - // warning related to the fact that best_rd isn't used after this - // point. This code should be refactored so that the duplicate - // checks occur in some sub function and thus are used... - (void)best_rd; + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + *rd_cost = best_rdc; - if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && - pc_tree->index != 3) { + if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif } if (bsize == BLOCK_64X64) { @@ -3035,6 +4159,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } else { assert(tp_orig == *tp); } + + return should_encode_sb; } static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, @@ -3048,23 +4174,33 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; RD_COST dummy_rdc; int i; int seg_skip = 0; + int orig_rdmult = cpi->rd.RDMULT; const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + vp9_rd_cost_reset(&dummy_rdc); + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -3076,7 +4212,10 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } } - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } td->pc_root->index = 0; if (seg->enabled) { @@ -3087,6 +4226,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; + + x->cb_rdmult = orig_rdmult; + if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { const BLOCK_SIZE bsize = seg_skip ? BLOCK_64X64 : sf->always_this_block_size; @@ -3094,30 +4236,46 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); - } else if (cpi->partition_search_skippable_frame) { - BLOCK_SIZE bsize; - set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); - set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); - rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, td->pc_root); } else if (sf->partition_search_type == VAR_BASED_PARTITION && cm->frame_type != KEY_FRAME) { choose_partitioning(cpi, tile_info, x, mi_row, mi_col); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); } else { + if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) { + int dr = + get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult); + x->cb_rdmult = dr; + } + + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) { + x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col); + x->cb_rdmult = vp9_compute_rd_mult( + cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex)); + } + // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } + td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rdc, INT64_MAX, td->pc_root); + &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } +#endif // !CONFIG_REALTIME_ONLY static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; @@ -3189,12 +4347,42 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - if (bsize < BLOCK_16X16) + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); } +static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF) + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); + else if (bsize >= BLOCK_8X8) + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + +static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3210,6 +4398,11 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int plane; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + + x->skip_recode = 0; + mi = xd->mi[0]; mi->sb_type = bsize; @@ -3225,14 +4418,23 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, if (cyclic_refresh_segment_id_boosted(mi->segment_id)) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) - set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); - else if (bsize >= BLOCK_8X8) - vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); - else + set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize); + else if (bsize >= BLOCK_8X8) { + if (cpi->rc.hybrid_intra_scene_change) + hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + } else { vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); @@ -3294,13 +4496,13 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row, } break; case PARTITION_SPLIT: { - fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]); + fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->u.split[0]); fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, - pc_tree->split[1]); + pc_tree->u.split[1]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, - pc_tree->split[2]); + pc_tree->u.split[2]); fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, - pc_tree->split[3]); + pc_tree->u.split[3]); break; } default: break; @@ -3318,10 +4520,81 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { if (bsize > BLOCK_8X8) { BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); int i; - for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize); + for (i = 0; i < 4; ++i) + pred_pixel_ready_reset(pc_tree->u.split[i], subsize); } } +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + + switch (bsize) { + case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh) return PARTITION_SPLIT; + if (score[0] < -thresh) return PARTITION_NONE; + return -1; + } +} +#undef FEATURES +#undef LABELS + static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -3351,8 +4624,14 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; + (void)*tp_orig; + // Avoid checking for rectangular partitions for speed >= 5. + if (cpi->oxcf.speed >= 5) do_rect = 0; + assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); @@ -3378,6 +4657,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, partition_vert_allowed &= force_vert_split; } + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } + } + + if (!partition_none_allowed && !do_split) do_rect = 1; + ctx->pred_pixel_ready = !(partition_vert_allowed || partition_horz_allowed || do_split); @@ -3391,26 +4682,25 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, ctx->skip = x->skip; if (this_rdc.rate != INT_MAX) { - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr; - int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr; - - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && - this_rdc.dist < dist_breakout_thr) { - do_split = 0; - do_rect = 0; + if (!use_ml_based_partitioning) { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } } } } @@ -3432,9 +4722,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; load_pred_mv(x, ctx); - nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, 0, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + nonrd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, 0, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->u.split[i]); if (this_rdc.rate == INT_MAX) { vp9_rd_cost_reset(&sum_rdc); @@ -3458,7 +4748,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->horizontal[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0]); @@ -3502,7 +4792,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->vertical[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0]); @@ -3580,6 +4870,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, PARTITION_TYPE partition; BLOCK_SIZE subsize; RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16; vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3593,7 +4885,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, 0, INT64_MAX, pc_tree); } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && - subsize >= BLOCK_16X16) { + subsize >= subsize_ref) { x->max_partition_size = BLOCK_32X32; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, @@ -3660,14 +4952,15 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, - pc_tree->split[0]); + pc_tree->u.split[0]); nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, &this_rdc, - pc_tree->split[1]); + pc_tree->u.split[1]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -3675,7 +4968,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - &this_rdc, pc_tree->split[2]); + &this_rdc, pc_tree->u.split[2]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; @@ -3683,14 +4976,13 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, - output_enabled, &this_rdc, pc_tree->split[3]); + output_enabled, &this_rdc, pc_tree->u.split[3]); if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX && rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) { rd_cost->rate += this_rdc.rate; rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -3779,34 +5071,132 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, - subsize, pc_tree->leaf_split[0]); + subsize, pc_tree->u.leaf_split[0]); } else { nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, - output_enabled, dummy_cost, pc_tree->split[0]); + output_enabled, dummy_cost, pc_tree->u.split[0]); nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row, mi_col + hbs, subsize, output_enabled, dummy_cost, - pc_tree->split[1]); + pc_tree->u.split[1]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp, mi_row + hbs, mi_col, subsize, output_enabled, - dummy_cost, pc_tree->split[2]); + dummy_cost, pc_tree->u.split[2]); nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp, mi_row + hbs, mi_col + hbs, subsize, output_enabled, - dummy_cost, pc_tree->split[3]); + dummy_cost, pc_tree->u.split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. +static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. + if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NO_REF_FRAME; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -3818,13 +5208,18 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) { const struct segmentation *const seg = &cm->seg; RD_COST dummy_rdc; const int idx_str = cm->mi_stride * mi_row + mi_col; @@ -3832,18 +5227,73 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + int i; + + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + x->source_variance = UINT_MAX; - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; x->sb_is_skin = 0; + x->skip_low_source_sad = 0; + x->lowvar_highsumdiff = 0; + x->content_state_sb = 0; + x->zero_temp_sad_source = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; + + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; + } if (seg->enabled) { const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + + if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] && + cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY && + x->content_state_sb > kLowSadLowSumdiff) { + // For ROI with skip, force segment = 0 (no skip) over whole + // superblock to avoid artifacts if temporal change in source_sad is + // not 0. + int xi, yi; + const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; + const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + set_mode_info_offsets(cm, x, xd, mi_row, mi_col); + for (yi = 0; yi < ymis; yi++) + for (xi = 0; xi < xmis; xi++) { + int map_offset = block_index + yi * cm->mi_cols + xi; + cpi->segmentation_map[map_offset] = 0; + } + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); + seg_skip = 0; + } if (seg_skip) { partition_search_type = FIXED_PARTITION; } @@ -3860,10 +5310,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; - case SOURCE_VAR_BASED_PARTITION: - set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); - nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, - BLOCK_64X64, 1, &dummy_rdc, td->pc_root); + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); break; case FIXED_PARTITION: if (!seg_skip) bsize = sf->always_this_block_size; @@ -3871,17 +5325,17 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); + x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); - // Use nonrd_pick_partition on scene-cut for VBR, or on qp-segment - // if cyclic_refresh is enabled. + // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cm->frame_type != KEY_FRAME) || - (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - xd->mi[0]->segment_id)) { - // Use lower max_partition_size for low resoultions. + cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { + // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; else @@ -3895,7 +5349,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, // TODO(marpan): Seems like nonrd_select_partition does not support // 4x4 partition. Since 4x4 is used on key frame, use this switch // for now. - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); else @@ -3904,123 +5358,25 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } + + // Update ref_frame usage for inter frame if this group is ARF group. + if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } // end RTC play code -static int set_var_thresh_from_histogram(VP9_COMP *cpi) { - const SPEED_FEATURES *const sf = &cpi->sf; - const VP9_COMMON *const cm = &cpi->common; - - const uint8_t *src = cpi->Source->y_buffer; - const uint8_t *last_src = cpi->Last_Source->y_buffer; - const int src_stride = cpi->Source->y_stride; - const int last_stride = cpi->Last_Source->y_stride; - - // Pick cutoff threshold - const int cutoff = (VPXMIN(cm->width, cm->height) >= 720) - ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) - : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); - DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); - diff *var16 = cpi->source_diff_var; - - int sum = 0; - int i, j; - - memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0])); - - for (i = 0; i < cm->mb_rows; i++) { - for (j = 0; j < cm->mb_cols; j++) { -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { - case VPX_BITS_8: - vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, - &var16->sse, &var16->sum); - break; - case VPX_BITS_10: - vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, - &var16->sse, &var16->sum); - break; - case VPX_BITS_12: - vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, - &var16->sse, &var16->sum); - break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; - } - } else { - vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, - &var16->sum); - } -#else - vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, - &var16->sum); -#endif // CONFIG_VP9_HIGHBITDEPTH - var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); - - if (var16->var >= VAR_HIST_MAX_BG_VAR) - hist[VAR_HIST_BINS - 1]++; - else - hist[var16->var / VAR_HIST_FACTOR]++; - - src += 16; - last_src += 16; - var16++; - } - - src = src - cm->mb_cols * 16 + 16 * src_stride; - last_src = last_src - cm->mb_cols * 16 + 16 * last_stride; - } - - cpi->source_var_thresh = 0; - - if (hist[VAR_HIST_BINS - 1] < cutoff) { - for (i = 0; i < VAR_HIST_BINS - 1; i++) { - sum += hist[i]; - - if (sum > cutoff) { - cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR; - return 0; - } - } - } - - return sf->search_type_check_frequency; -} - -static void source_var_based_partition_search_method(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - SPEED_FEATURES *const sf = &cpi->sf; - - if (cm->frame_type == KEY_FRAME) { - // For key frame, use SEARCH_PARTITION. - sf->partition_search_type = SEARCH_PARTITION; - } else if (cm->intra_only) { - sf->partition_search_type = FIXED_PARTITION; - } else { - if (cm->last_width != cm->width || cm->last_height != cm->height) { - if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); - - CHECK_MEM_ERROR(cm, cpi->source_diff_var, - vpx_calloc(cm->MBs, sizeof(diff))); - } - - if (!cpi->frames_till_next_var_check) - cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi); - - if (cpi->frames_till_next_var_check > 0) { - sf->partition_search_type = FIXED_PARTITION; - cpi->frames_till_next_var_check--; - } - } -} - static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) { unsigned int intra_count = 0, inter_count = 0; int j; @@ -4040,12 +5396,20 @@ void vp9_init_tile_data(VP9_COMP *cpi) { const int tile_rows = 1 << cm->log2_tile_rows; int tile_col, tile_row; TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; int tile_tok = 0; + int tplist_count = 0; if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { - if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + if (cpi->tile_data != NULL) { + // Free the row mt memory in cpi->tile_data first. + vp9_row_mt_mem_dealloc(cpi); + vpx_free(cpi->tile_data); + } + cpi->allocated_tiles = 0; + CHECK_MEM_ERROR( + &cm->error, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) @@ -4053,55 +5417,88 @@ void vp9_init_tile_data(VP9_COMP *cpi) { TileDataEnc *tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; + const MV zero_mv = { 0, 0 }; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = 32; + tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; tile_data->mode_map[i][j] = j; } } + tile_data->firstpass_top_mv = zero_mv; +#if CONFIG_MULTITHREAD + tile_data->row_base_thresh_freq_fact = NULL; +#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; + if (cpi->sf.adaptive_rd_thresh_row_mt) { + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); + } vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; pre_tok = cpi->tile_tok[tile_row][tile_col]; tile_tok = allocated_tokens(*tile_info); + + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } } } +void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + int tile_sb_row; + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + + tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >> + MI_BLOCK_SIZE_LOG2; + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok); + cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok; + +#if CONFIG_REALTIME_ONLY + assert(cpi->sf.use_nonrd_pick_mode); + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#else + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); +#endif + + cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; + cpi->tplist[tile_row][tile_col][tile_sb_row].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop - + cpi->tplist[tile_row][tile_col][tile_sb_row].start); + assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <= + get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols)); + + (void)tile_mb_cols; +} + void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int tile_col) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; const TileInfo *const tile_info = &this_tile->tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; const int mi_row_start = tile_info->mi_row_start; const int mi_row_end = tile_info->mi_row_end; int mi_row; - // Set up pointers to per thread motion search counters. - this_tile->m_search_count = 0; // Count of motion search hits. - this_tile->ex_search_count = 0; // Exhaustive mesh search hits. - td->mb.m_search_count_ptr = &this_tile->m_search_count; - td->mb.ex_search_count_ptr = &this_tile->ex_search_count; - - for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) { - if (cpi->sf.use_nonrd_pick_mode) - encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); - else - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); - } - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(tok - cpi->tile_tok[tile_row][tile_col] <= - allocated_tokens(*tile_info)); + for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) + vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); } static void encode_tiles(VP9_COMP *cpi) { @@ -4117,19 +5514,106 @@ static void encode_tiles(VP9_COMP *cpi) { vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col); } -#if CONFIG_FP_MB_STATS -static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, - VP9_COMMON *cm, uint8_t **this_frame_mb_stats) { - uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + - cm->current_video_frame * cm->MBs * sizeof(uint8_t); - - if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF; - - *this_frame_mb_stats = mb_stats_in; - - return 1; +static int compare_kmeans_data(const void *a, const void *b) { + if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { + return 1; + } else if (((const KMEANS_DATA *)a)->value < + ((const KMEANS_DATA *)b)->value) { + return -1; + } else { + return 0; + } +} + +static void compute_boundary_ls(const double *ctr_ls, int k, + double *boundary_ls) { + // boundary_ls[j] is the upper bound of data centered at ctr_ls[j] + int j; + for (j = 0; j < k - 1; ++j) { + boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.; + } + boundary_ls[k - 1] = DBL_MAX; +} + +int vp9_get_group_idx(double value, double *boundary_ls, int k) { + int group_idx = 0; + while (value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + return group_idx; +} + +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + KMEANS_DATA *arr, int size) { + int i, j; + int itr; + int group_idx; + double sum[MAX_KMEANS_GROUPS]; + int count[MAX_KMEANS_GROUPS]; + + vpx_clear_system_state(); + + assert(k >= 2 && k <= MAX_KMEANS_GROUPS); + + qsort(arr, size, sizeof(*arr), compare_kmeans_data); + + // initialize the center points + for (j = 0; j < k; ++j) { + ctr_ls[j] = arr[(size * (2 * j + 1)) / (2 * k)].value; + } + + for (itr = 0; itr < 10; ++itr) { + compute_boundary_ls(ctr_ls, k, boundary_ls); + for (i = 0; i < MAX_KMEANS_GROUPS; ++i) { + sum[i] = 0; + count[i] = 0; + } + + // Both the data and centers are sorted in ascending order. + // As each data point is processed in order, its corresponding group index + // can only increase. So we only need to reset the group index to zero here. + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + // place samples into clusters + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + sum[group_idx] += arr[i].value; + ++count[group_idx]; + } + + for (group_idx = 0; group_idx < k; ++group_idx) { + if (count[group_idx] > 0) + ctr_ls[group_idx] = sum[group_idx] / count[group_idx]; + + sum[group_idx] = 0; + count[group_idx] = 0; + } + } + + // compute group_idx, boundary_ls and count_ls + for (j = 0; j < k; ++j) { + count_ls[j] = 0; + } + compute_boundary_ls(ctr_ls, k, boundary_ls); + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + arr[i].group_idx = group_idx; + ++count_ls[group_idx]; + } } -#endif static void encode_frame_internal(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; @@ -4137,10 +5621,10 @@ static void encode_frame_internal(VP9_COMP *cpi) { MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int gf_group_index = cpi->twopass.gf_group.index; xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - vp9_zero(*td->counts); vp9_zero(cpi->td.rd_counts); @@ -4149,17 +5633,19 @@ static void encode_frame_internal(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; - x->highbd_itxm_add = + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - + x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; if (xd->lossless) x->optimize = 0; + x->sharpness = cpi->oxcf.sharpness; + x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); cm->tx_mode = select_tx_mode(cpi, xd); @@ -4198,30 +5684,67 @@ static void encode_frame_internal(VP9_COMP *cpi) { !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) && !cpi->use_svc) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE && + cpi->sf.enable_tpl_model) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) - source_var_based_partition_search_method(cpi); + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + if (tpl_frame->is_valid) + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + + // Frame segmentation + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); + { +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, - &cpi->twopass.this_frame_mb_stats); - } #endif - // If allowed, encoding tiles in parallel with one thread handling one tile. - if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) - vp9_encode_tiles_mt(cpi); - else - encode_tiles(cpi); + if (!cpi->row_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. + if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); +#endif } sf->skip_encode_frame = @@ -4256,7 +5779,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { int mi_row, mi_col; int sum_delta = 0; - int map_index = 0; int qdelta_index; int segment_id; @@ -4266,7 +5788,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { segment_id = mi_8x8[0]->segment_id; qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); sum_delta += qdelta_index; - map_index++; } mi_8x8_ptr += cm->mi_stride; } @@ -4274,9 +5795,39 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int tile_idx; + int i, j; + TileDataEnc *tile_data; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; + restore_encode_params(cpi); + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(MAX_MB_PLANE); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -4284,16 +5835,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. if (!frame_is_intra_only(cm)) { - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { + if (vp9_compound_reference_allowed(cm)) { cpi->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + vp9_setup_compound_reference_mode(cm); + } else { + cpi->allow_comp_inter_inter = 0; } } @@ -4330,7 +5876,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; @@ -4391,8 +5943,31 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } } else { + FRAME_COUNTS *counts = cpi->td.counts; cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } } // If segmented AQ is enabled compute the average AQ weighting. @@ -4434,7 +6009,8 @@ static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi, for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { int map_offset = block_index + y * cm->mi_cols + x; - if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { if (abs(mv.row) < 8 && abs(mv.col) < 8) { if (cpi->consec_zero_mv[map_offset] < 255) cpi->consec_zero_mv[map_offset]++; @@ -4501,7 +6077,27 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, VPXMAX(bsize, BLOCK_8X8)); - vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8)); +#if CONFIG_MISMATCH_DEBUG + if (output_enabled) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif + + vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled); vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip, VPXMAX(bsize, BLOCK_8X8)); } @@ -4527,9 +6123,14 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[mi->tx_size]; ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; - if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->content_mode) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h index aa54947858..97fe52484a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_ -#define VP9_ENCODER_VP9_ENCODEFRAME_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ #include "vpx/vpx_integer.h" @@ -22,13 +22,6 @@ struct yv12_buffer_config; struct VP9_COMP; struct ThreadData; -// Constants used in SOURCE_VAR_BASED_PARTITION -#define VAR_HIST_MAX_BG_VAR 1000 -#define VAR_HIST_FACTOR 10 -#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) -#define VAR_HIST_LARGE_CUT_OFF 75 -#define VAR_HIST_SMALL_CUT_OFF 45 - void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, int mi_col); @@ -39,10 +32,19 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); -void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, + int content_state); + +struct KMEANS_DATA; +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + struct KMEANS_DATA *arr, int size); +int vp9_get_group_idx(double value, double *boundary_ls, int k); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c index 2cb137d8b9..eded9f5c42 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c @@ -16,12 +16,17 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" @@ -49,36 +54,14 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))) int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { @@ -86,221 +69,264 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); + const PLANE_TYPE plane_type = get_plane_type(plane); const int default_eob = 16 << (tx_size << 1); const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1; + const MODE_INFO *mbmi = xd->mi[0]; + const int sharpness = mb->sharpness; + const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; + const int64_t rdmult = + (sharpness == 0 ? rdadj >> 1 + : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); + const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; + int64_t rate0, rate1; int16_t t0, t1; - EXTRABIT e0; - unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - int best, band, pt, i, final_eob; + int i, final_eob; + int count_high_values_after_eob = 0; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; - assert((!type && !plane) || (type && plane)); + assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. */ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - - for (i = 0; i < eob; i++) - token_cache[scan[i]] = vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])]; - - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; + for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - vp9_get_token_extra(x, &t0, &e0); - /* Consider both possible successor states. */ - if (next < default_eob) { - band = band_translate[i + 1]; - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += token_costs[band][0][pt][tokens[next][0].token]; - rate1 += token_costs[band][0][pt][tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; + token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; + } + final_eob = 0; - /* Evaluate the second possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { - sz = -(x < 0); - x -= 2 * sz + 1; - } else { - tokens[i][1] = tokens[i][0]; - next = i; - continue; - } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - e0 = 0; - } else { - vp9_get_token_extra(x, &t0, &e0); - t1 = t0; - } - if (next < default_eob) { - band = band_translate[i + 1]; - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += token_costs[band][!x][pt][tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += token_costs[band][!x][pt][tokens[next][1].token]; - } - } - - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. - if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } - - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { + const int rc = scan[i]; + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - band = band_translate[i + 1]; - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += token_costs[band][1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; + } else { + distortion1 = distortion_for_zero; } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += token_costs[band][1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; + { + // Calculate RDCost for current coeff for the two candidates. + const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; + } + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } + } + + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } + + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } + + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + count_high_values_after_eob = 0; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ } } - - /* Now pick the best path through the whole trellis. */ - band = band_translate[i + 1]; - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += token_costs[band][0][ctx][t0]; - rate1 += token_costs[band][0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; + if (count_high_values_after_eob > 0) { + final_eob = eob - 1; + for (; final_eob >= 0; final_eob--) { + const int rc = scan[final_eob]; + const int x = qcoeff[rc]; + if (x) { + break; + } + } + final_eob++; + } else { + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } } - final_eob++; - mb->plane[plane].eobs[block] = final_eob; return final_eob; } +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { @@ -325,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -333,39 +359,33 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - default: assert(0); } return; } @@ -374,30 +394,26 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: - vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, - p->zbin, p->round_fp, p->quant_fp, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - default: assert(0); break; } } @@ -413,34 +429,33 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { case TX_32X32: vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, - pd->dequant[0], eob); + vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_16X16: vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; case TX_8X8: vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, - p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], - eob); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, + dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -449,25 +464,25 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: vpx_fdct32x32_1(src_diff, coeff, diff_stride); - vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_16X16: vpx_fdct16x16_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; case TX_8X8: vpx_fdct8x8_1(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], - qcoeff, dqcoeff, pd->dequant[0], eob); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -476,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -484,39 +499,33 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - default: assert(0); } return; } @@ -525,36 +534,36 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - default: assert(0); break; } } static void encode_block(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; +#if CONFIG_MISMATCH_DEBUG + int mi_row = args->mi_row; + int mi_col = args->mi_col; + int output_enabled = args->output_enabled; +#endif MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; @@ -571,7 +580,11 @@ static void encode_block(int plane, int block, int row, int col, if (x->zcoeff_blk[tx_size][block] && plane == 0) { p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } if (!x->skip_recode) { @@ -581,7 +594,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } else { vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size); } @@ -598,7 +615,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } } else { vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); @@ -615,32 +636,43 @@ static void encode_block(int plane, int block, int row, int col, if (p->eobs[block]) *(args->skip) = 0; - if (x->skip_encode || p->eobs[block] == 0) return; + if (x->skip_encode || p->eobs[block] == 0) { +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], + vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], - xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); break; - default: assert(0 && "Invalid transform size"); } +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -654,14 +686,27 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } +#if CONFIG_MISMATCH_DEBUG +encode_block_end: + if (output_enabled) { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, + blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } static void encode_block_pass1(int plane, int block, int row, int col, @@ -680,11 +725,12 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd); + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } } @@ -694,12 +740,34 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, x); } -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MODE_INFO *mi = xd->mi[0]; - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; int plane; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip, mi_row, mi_col, output_enabled }; +#else + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; + (void)mi_row; + (void)mi_col; + (void)output_enabled; +#endif mi->skip = 1; @@ -713,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.enable_coeff_opt = 1; + arg.enable_trellis_opt = 1; } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; @@ -737,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; + const ScanOrder *scan_order; TX_TYPE tx_type = DCT_DCT; PREDICTION_MODE mode; const int bwl = b_width_log2_lookup[plane_bsize]; @@ -747,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; ENTROPY_CONTEXT *a = NULL; ENTROPY_CONTEXT *l = NULL; int entropy_ctx = 0; dst = &pd->dst.buf[4 * (row * dst_stride + col)]; src = &p->src.buf[4 * (row * src_stride + col)]; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; - if (args->enable_coeff_opt) { - a = &args->ta[col]; - l = &args->tl[row]; - entropy_ctx = combine_entropy_contexts(*a, *l); - } if (tx_size == TX_4X4) { tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); @@ -773,89 +837,115 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } - vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst, - x->skip_encode ? src_stride : dst_stride, dst, - dst_stride, col, row, plane); + vp9_predict_intra_block( + xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, + (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, + dst_stride, col, row, plane); + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, + plane_bsize, tx_size, args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_16X16: if (!x->skip_recode) { - vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; case TX_8X8: if (!x->skip_recode) { - vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { - vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob, + vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } + if (enable_trellis_opt) { + *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } - if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) { // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { - vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd); + vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, + xd->bd); } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -865,15 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, - dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -881,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -896,32 +979,27 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, - dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -929,31 +1007,53 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b) { + int enable_trellis_opt) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], - ctx.tl[plane], &xd->mi[0]->skip }; +#if CONFIG_MISMATCH_DEBUG + // TODO(angiebird): make mismatch_debug support intra mode + struct encode_b_args arg = { + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; +#endif - if (enable_optimize_b && x->optimize && + if (enable_trellis_opt && x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } vp9_foreach_transformed_block_in_plane(xd, bsize, plane, diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h index cf943bedfd..1391446bed 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMB_H_ -#define VP9_ENCODER_VP9_ENCODEMB_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -20,14 +20,23 @@ extern "C" { struct encode_b_args { MACROBLOCK *x; - int enable_coeff_opt; + int enable_trellis_opt; + double trellis_opt_thresh; + int *sse_calc_done; + int64_t *sse; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; +#if CONFIG_MISMATCH_DEBUG + int mi_row; + int mi_col; + int output_enabled; +#endif }; int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx); -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size); @@ -42,10 +51,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b); + int enable_trellis_opt); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMB_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h index 9fc7ab8dc4..2f1be4b233 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMV_H_ -#define VP9_ENCODER_VP9_ENCODEMV_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_ #include "vp9/encoder/vp9_encoder.h" @@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *ctx, int usehp); void vp9_update_mv_count(ThreadData *td); @@ -35,4 +35,4 @@ void vp9_update_mv_count(ThreadData *td); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMV_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c index 432eac8da0..3f42655527 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -8,25 +8,38 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include #include +#include +#include +#include +#include +#include #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/psnr.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" +#include "vpx_ports/vpx_once.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_pthread.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" #if CONFIG_VP9_POSTPROC @@ -34,24 +47,38 @@ #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_tile_common.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" #include "vp9/encoder/vp9_aq_360.h" #include "vp9/encoder/vp9_aq_complexity.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_bitstream.h" +#if CONFIG_INTERNAL_STATS +#include "vp9/encoder/vp9_blockiness.h" +#endif #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_mcomp.h" +#endif +#include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" +#include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_resize.h" @@ -60,27 +87,34 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vp9/vp9_cx_iface.h" #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. -// #define OUTPUT_YUV_REC +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 + +#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold +#define FRAME_RATE_FACTOR 8 #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_SKINMAP -FILE *yuv_skinmap_file = NULL; +static FILE *yuv_skinmap_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif +#ifdef OUTPUT_YUV_SVC_SRC +FILE *yuv_svc_src[3] = { NULL, NULL, NULL }; +#endif #if 0 FILE *framepsnr; @@ -99,9 +133,336 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +#if !CONFIG_REALTIME_ONLY +// compute adaptive threshold for skip recoding +static int compute_context_model_thresh(const VP9_COMP *const cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_size = (cm->width * cm->height) >> 10; + const int bitrate = (int)(oxcf->target_bandwidth >> 10); + const int qindex_factor = cm->base_qindex + (MAXQ >> 1); + + // This equation makes the threshold adaptive to frame size. + // Coding gain obtained by recoding comes from alternate frames of large + // content change. We skip recoding if the difference of previous and current + // frame context probability model is less than a certain threshold. + // The first component is the most critical part to guarantee adaptivity. + // Other parameters are estimated based on normal setting of hd resolution + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + const int thresh = + ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * + qindex_factor) >> + 9; + + return thresh; +} + +// compute the total cost difference between current +// and previous frame context prob model. +static int compute_context_model_diff(const VP9_COMMON *const cm) { + const FRAME_CONTEXT *const pre_fc = + &cm->frame_contexts[cm->frame_context_idx]; + const FRAME_CONTEXT *const cur_fc = cm->fc; + const FRAME_COUNTS *counts = &cm->counts; + vpx_prob pre_last_prob, cur_last_prob; + int diff = 0; + int i, j, k, l, m, n; + + // y_mode_prob + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->y_mode[i][j] * + (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->y_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // uv_mode_prob + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES - 1; ++j) { + diff += (int)counts->uv_mode[i][j] * + (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2]; + + diff += (int)counts->uv_mode[i][INTRA_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // partition_prob + for (i = 0; i < PARTITION_CONTEXTS; ++i) { + for (j = 0; j < PARTITION_TYPES - 1; ++j) { + diff += (int)counts->partition[i][j] * + (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2]; + cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2]; + + diff += (int)counts->partition[i][PARTITION_TYPES - 1] * + (pre_last_prob - cur_last_prob); + } + + // coef_probs + for (i = 0; i < TX_SIZES; ++i) { + for (j = 0; j < PLANE_TYPES; ++j) { + for (k = 0; k < REF_TYPES; ++k) { + for (l = 0; l < COEF_BANDS; ++l) { + for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) { + for (n = 0; n < UNCONSTRAINED_NODES; ++n) { + diff += (int)counts->coef[i][j][k][l][m][n] * + (pre_fc->coef_probs[i][j][k][l][m][n] - + cur_fc->coef_probs[i][j][k][l][m][n]); + } + + pre_last_prob = + MAX_PROB - + pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + cur_last_prob = + MAX_PROB - + cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1]; + + diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] * + (pre_last_prob - cur_last_prob); + } + } + } + } + } + + // switchable_interp_prob + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { + for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) { + diff += (int)counts->switchable_interp[i][j] * + (pre_fc->switchable_interp_prob[i][j] - + cur_fc->switchable_interp_prob[i][j]); + } + pre_last_prob = + MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + cur_last_prob = + MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2]; + + diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] * + (pre_last_prob - cur_last_prob); + } + + // inter_mode_probs + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + for (j = 0; j < INTER_MODES - 1; ++j) { + diff += (int)counts->inter_mode[i][j] * + (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2]; + cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2]; + + diff += (int)counts->inter_mode[i][INTER_MODES - 1] * + (pre_last_prob - cur_last_prob); + } + + // intra_inter_prob + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + diff += (int)counts->intra_inter[i][0] * + (pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i]; + + diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // comp_inter_prob + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + diff += (int)counts->comp_inter[i][0] * + (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i]; + + diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob); + } + + // single_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < 2; ++j) { + diff += (int)counts->single_ref[i][j][0] * + (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]); + + pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j]; + cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j]; + + diff += + (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob); + } + } + + // comp_ref_prob + for (i = 0; i < REF_CONTEXTS; ++i) { + diff += (int)counts->comp_ref[i][0] * + (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]); + + pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i]; + cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i]; + + diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob); + } + + // tx_probs + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + // p32x32 + for (j = 0; j < TX_SIZES - 1; ++j) { + diff += (int)counts->tx.p32x32[i][j] * + (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2]; + + diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] * + (pre_last_prob - cur_last_prob); + + // p16x16 + for (j = 0; j < TX_SIZES - 2; ++j) { + diff += (int)counts->tx.p16x16[i][j] * + (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3]; + + diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] * + (pre_last_prob - cur_last_prob); + + // p8x8 + for (j = 0; j < TX_SIZES - 3; ++j) { + diff += (int)counts->tx.p8x8[i][j] * + (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]); + } + pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4]; + + diff += + (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob); + } + + // skip_probs + for (i = 0; i < SKIP_CONTEXTS; ++i) { + diff += (int)counts->skip[i][0] * + (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]); + + pre_last_prob = MAX_PROB - pre_fc->skip_probs[i]; + cur_last_prob = MAX_PROB - cur_fc->skip_probs[i]; + + diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob); + } + + // mv + for (i = 0; i < MV_JOINTS - 1; ++i) { + diff += (int)counts->mv.joints[i] * + (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]); + } + pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2]; + cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2]; + + diff += + (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob); + + for (i = 0; i < 2; ++i) { + const nmv_component_counts *nmv_count = &counts->mv.comps[i]; + const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i]; + const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i]; + + // sign + diff += (int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign); + + pre_last_prob = MAX_PROB - pre_nmv_prob->sign; + cur_last_prob = MAX_PROB - cur_nmv_prob->sign; + + diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob); + + // classes + for (j = 0; j < MV_CLASSES - 1; ++j) { + diff += (int)nmv_count->classes[j] * + (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2]; + + diff += (int)nmv_count->classes[MV_CLASSES - 1] * + (pre_last_prob - cur_last_prob); + + // class0 + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + diff += (int)nmv_count->class0[j] * + (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2]; + + diff += (int)nmv_count->class0[CLASS0_SIZE - 1] * + (pre_last_prob - cur_last_prob); + + // bits + for (j = 0; j < MV_OFFSET_BITS; ++j) { + diff += (int)nmv_count->bits[j][0] * + (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]); + + pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j]; + cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j]; + + diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob); + } + + // class0_fp + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < MV_FP_SIZE - 1; ++k) { + diff += (int)nmv_count->class0_fp[j][k] * + (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2]; + + diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] * + (pre_last_prob - cur_last_prob); + } + + // fp + for (j = 0; j < MV_FP_SIZE - 1; ++j) { + diff += + (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]); + } + pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2]; + cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2]; + + diff += + (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob); + + // class0_hp + diff += (int)nmv_count->class0_hp[0] * + (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp; + + diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob); + + // hp + diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp); + + pre_last_prob = MAX_PROB - pre_nmv_prob->hp; + cur_last_prob = MAX_PROB - cur_nmv_prob->hp; + + diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob); + } + + return -diff; +} +#endif // !CONFIG_REALTIME_ONLY + // Test for whether to calculate metrics for the frame. -static int is_psnr_calc_enabled(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +static int is_psnr_calc_enabled(const VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; return cpi->b_calculate_psnr && (oxcf->pass != 1) && cm->show_frame; @@ -109,58 +470,57 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { /* clang-format off */ const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when - // they are finalized (currently TBD). - { LEVEL_5_2, 1176502272, 8912896, 180000, 0, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 0, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 0, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 0, 8, 16, 10, 4 }, + // they are finalized (currently tentative). + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ -static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = - { "The average bit-rate is too high.", - "The picture size is too large.", - "The luma sample rate is too large.", - "The CPB size is too large.", - "The compression ratio is too small", - "Too many column tiles are used.", - "The alt-ref distance is too small.", - "Too many reference buffers are used." }; +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." +}; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + default: + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; - default: - *hr = 1; - *hs = 1; - assert(0); - break; } } @@ -216,6 +576,72 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + + // TODO(jianj): Investigate why ROI not working in speed < 5 or in non + // realtime mode. + if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method; + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) { + valid_ref = 0; + } + // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are + // same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -226,6 +652,18 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + int i; + for (i = 0; i < 8; ++i) { + // Note abs() alone can't be used as the behavior of abs(INT_MIN) is + // undefined. + if (seg_data[i] > range || seg_data[i] < -range) { + return 0; + } + } + return 1; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -238,6 +676,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { (double)this_level->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->compression_ratio < this_level->compression_ratio || @@ -250,6 +690,63 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check number of rows and columns match + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return VPX_CODEC_INVALID_PARAM; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return VPX_CODEC_INVALID_PARAM; + + // Also disable segmentation if no deltas are specified. + if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return VPX_CODEC_OK; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; + + // Copy to ROI structure in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return VPX_CODEC_OK; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -327,8 +824,37 @@ static void setup_frame(VP9_COMP *cpi) { if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame; } + // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF + // case. Need some further investigation on if we could apply this to single + // layer ARF case as well. + if (cpi->multi_layer_arf && !cpi->use_svc) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const int gf_group_index = gf_group->index; + const int boost_frame = + !cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + + // frame_context_idx Frame Type + // 0 Intra only frame, base layer ARF + // 1 ARFs with layer depth = 2,3 + // 2 ARFs with layer depth > 3 + // 3 Non-boosted frames + if (frame_is_intra_only(cm)) { + cm->frame_context_idx = 0; + } else if (boost_frame) { + if (gf_group->rf_level[gf_group_index] == GF_ARF_STD) + cm->frame_context_idx = 0; + else if (gf_group->layer_depth[gf_group_index] <= 3) + cm->frame_context_idx = 1; + else + cm->frame_context_idx = 2; + } else { + cm->frame_context_idx = 3; + } + } + if (cm->frame_type == KEY_FRAME) { - if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; vp9_zero(cpi->interp_filter_selected); } else { @@ -362,10 +888,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -380,12 +907,17 @@ static void vp9_enc_free_mi(VP9_COMMON *cm) { cm->mi_grid_base = NULL; vpx_free(cm->prev_mi_grid_base); cm->prev_mi_grid_base = NULL; + cm->mi_alloc_size = 0; } static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO **temp_base = cm->prev_mi_grid_base; MODE_INFO *temp = cm->prev_mip; + + // Skip update prev_mi frame in show_existing_frame mode. + if (cm->show_existing_frame) return; + cm->prev_mip = cm->mip; cm->mip = temp; @@ -399,22 +931,21 @@ static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; } -void vp9_initialize_enc(void) { - static volatile int init_done = 0; - - if (!init_done) { - vp9_rtcd(); - vpx_dsp_rtcd(); - vpx_scale_rtcd(); - vp9_init_intra_predictors(); - vp9_init_me_luts(); - vp9_rc_init_minq_luts(); - vp9_entropy_mv_init(); - vp9_temporal_filter_init(); - init_done = 1; - } +static void initialize_enc(void) { + vp9_rtcd(); + vpx_dsp_rtcd(); + vpx_scale_rtcd(); + vp9_init_intra_predictors(); + vp9_init_me_luts(); + vp9_rc_init_minq_luts(); + vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY + vp9_temporal_filter_init(); +#endif } +void vp9_initialize_enc(void) { once(initialize_enc); } + static void dealloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i; @@ -450,21 +981,53 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + vpx_free(cpi->prev_partition); cpi->prev_partition = NULL; + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + vpx_free(cpi->prev_segment_id); cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; + + vpx_free(cpi->content_state_sb_fd); + cpi->content_state_sb_fd = NULL; + + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + vpx_free(cpi->sb_mul_scale); + cpi->sb_mul_scale = NULL; + + vpx_free(cpi->mi_ssim_rdmult_scaling_factors); + cpi->mi_ssim_rdmult_scaling_factors = NULL; + vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC vp9_free_postproc_buffers(cm); @@ -474,7 +1037,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free_frame_buffer(&cpi->last_frame_uf); vpx_free_frame_buffer(&cpi->scaled_source); vpx_free_frame_buffer(&cpi->scaled_last_source); - vpx_free_frame_buffer(&cpi->alt_ref_buffer); + vpx_free_frame_buffer(&cpi->tf_buffer); #ifdef ENABLE_KF_DENOISE vpx_free_frame_buffer(&cpi->raw_unscaled_source); vpx_free_frame_buffer(&cpi->raw_scaled_source); @@ -485,6 +1048,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->tile_tok[0][0]); cpi->tile_tok[0][0] = 0; + vpx_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + vp9_free_pc_tree(&cpi->td); for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { @@ -494,11 +1060,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { lc->rc_twopass_stats_in.sz = 0; } - if (cpi->source_diff_var != NULL) { - vpx_free(cpi->source_diff_var); - cpi->source_diff_var = NULL; - } - for (i = 0; i < MAX_LAG_BUFFERS; ++i) { vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]); } @@ -570,6 +1131,7 @@ static void restore_coding_context(VP9_COMP *cpi) { *cm->fc = cc->fc; } +#if !CONFIG_REALTIME_ONLY static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -693,6 +1255,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } } +#endif // !CONFIG_REALTIME_ONLY static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -726,7 +1289,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { "Failed to allocate lag buffers"); // TODO(agrange) Check if ARF is enabled and skip allocation if not. - if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height, + if (vpx_realloc_frame_buffer(&cpi->tf_buffer, oxcf->width, oxcf->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, @@ -734,7 +1297,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate altref buffer"); + "Failed to allocate temporal filter buffer"); } static void alloc_util_frame_buffers(VP9_COMP *cpi) { @@ -761,8 +1324,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. - if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) { + // target of 1/4x1/4. number_spatial_layers must be greater than 2. + if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -807,20 +1371,22 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { #endif } -static int alloc_context_buffers_ext(VP9_COMP *cpi) { +static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)); - if (!cpi->mbmi_ext_base) return 1; - - return 0; + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, + vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } static void alloc_compressor_data(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + int sb_rows; - vp9_alloc_context_buffers(cm, cm->width, cm->height); + if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } alloc_context_buffers_ext(cpi); @@ -828,10 +1394,16 @@ static void alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } + sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + vpx_free(cpi->tplist[0][0]); + CHECK_MEM_ERROR( + &cm->error, cpi->tplist[0][0], + vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); + vp9_setup_pc_tree(&cpi->common, &cpi->td); } @@ -846,14 +1418,23 @@ static void set_tile_limits(VP9_COMP *cpi) { int min_log2_tile_cols, max_log2_tile_cols; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - if (is_two_pass_svc(cpi) && (cpi->svc.encode_empty_frame_state == ENCODING || - cpi->svc.number_spatial_layers > 1)) { - cm->log2_tile_cols = 0; - cm->log2_tile_rows = 0; - } else { - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + + // Max allowed number of tile_rows is 4 (so log2_tile_rows = 2), and each + // tile_row contains a multiple of superblocks. + const int sb64_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> 3; + const int max_log2_tile_rows = (sb64_rows >= 4) ? 2 + : (sb64_rows >= 2) ? 1 + : 0; + cm->log2_tile_rows = VPXMIN(cpi->oxcf.tile_rows, max_log2_tile_rows); + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } } } @@ -869,31 +1450,23 @@ static void update_frame_size(VP9_COMP *cpi) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_limits(cpi); - - if (is_two_pass_svc(cpi)) { - if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate alt_ref_buffer"); - } } static void init_buffer_indices(VP9_COMP *cpi) { - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; + int ref_frame; + + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + cpi->ref_fb_idx[ref_frame] = ref_frame; + + cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1]; + cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; } static void init_level_constraint(LevelConstraint *lc) { lc->level_index = -1; lc->max_cpb_size = INT_MAX; lc->max_frame_size = INT_MAX; - lc->rc_config_updated = 0; lc->fail_flag = 0; } @@ -905,7 +1478,7 @@ static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { } } -static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { +static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; @@ -937,7 +1510,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { // Temporal scalability. cpi->svc.number_temporal_layers = oxcf->ts_number_layers; - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -953,10 +1526,32 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { init_buffer_indices(cpi); vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height); + cpi->fixed_qp_onepass = 0; } -static void set_rc_buffer_sizes(RATE_CONTROL *rc, - const VP9EncoderConfig *oxcf) { +void vp9_check_reset_rc_flag(VP9_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + + if (cpi->common.current_video_frame > + (unsigned int)cpi->svc.number_spatial_layers) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth / 3 > (rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } +} + +void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { + RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const int64_t bandwidth = oxcf->target_bandwidth; const int64_t starting = oxcf->starting_buffer_level_ms; const int64_t optimal = oxcf->optimal_buffer_level_ms; @@ -967,18 +1562,23 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size); } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1016,47 +1616,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1079,322 +1638,362 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, - vpx_highbd_sad16x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, - vpx_highbd_sad16x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) HIGHBD_BFP( - BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, - vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) HIGHBD_BFP( - BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, - vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, - vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, - vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, - vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, - vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, - vpx_highbd_sad8x16x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, - vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, - vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) - break; - - case VPX_BITS_12: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, - vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, - vpx_highbd_sad8x16x4d_bits12) + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, - vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, - vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) + break; } } } @@ -1405,32 +2004,89 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { // Create the encoder segmentation map and set all entries to 0 vpx_free(cpi->segmentation_map); - CHECK_MEM_ERROR(cm, cpi->segmentation_map, + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); - CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); // Create a map used to mark inactive areas. vpx_free(cpi->active_map.map); - CHECK_MEM_ERROR(cm, cpi->active_map.map, + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it vpx_free(cpi->coding_context.last_frame_seg_map_copy); - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } +static void alloc_copy_partition_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->prev_partition == NULL) { + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->prev_partition))); + } + if (cpi->prev_segment_id == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->prev_segment_id, + (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->prev_segment_id))); + } + if (cpi->prev_variance_low == NULL) { + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, + sizeof(*cpi->prev_variance_low))); + } + if (cpi->copied_frame_cnt == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->copied_frame_cnt, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->copied_frame_cnt))); + } +} + +static void free_copy_partition_data(VP9_COMP *cpi) { + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vpx_free(cpi->prev_variance_low); + cpi->prev_variance_low = NULL; + vpx_free(cpi->copied_frame_cnt); + cpi->copied_frame_cnt = NULL; +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +static void setup_denoiser_buffer(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate denoiser"); + } +} +#endif + void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int last_w = cpi->oxcf.width; int last_h = cpi->oxcf.height; + vp9_init_quantizer(cpi); if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; cm->color_space = oxcf->color_space; @@ -1473,12 +2129,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { } cpi->encode_breakout = cpi->oxcf.encode_breakout; - set_rc_buffer_sizes(rc, &cpi->oxcf); - - // Under a configuration change, where maximum_buffer_size may change, - // keep buffer level clipped to the maximum allowed buffer size. - rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); - rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size); + vp9_set_rc_buffer_sizes(cpi); // Set up frame rate and related parameters rate control values. vp9_new_framerate(cpi, cpi->framerate); @@ -1502,19 +2153,22 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->external_resize = 1; } - if (cpi->initial_width) { - int new_mi_size = 0; - vp9_set_mb_mi(cm, cm->width, cm->height); - new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); - if (cm->mi_alloc_size < new_mi_size) { - vp9_free_context_buffers(cm); - alloc_compressor_data(cpi); - realloc_segmentation_maps(cpi); - cpi->initial_width = cpi->initial_height = 0; - cpi->external_resize = 0; - } else if (cm->mi_alloc_size == new_mi_size && - (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { - vp9_alloc_loop_filter(cm); + int new_mi_size = 0; + vp9_set_mb_mi(cm, cm->width, cm->height); + new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows); + if (cm->mi_alloc_size < new_mi_size) { + vp9_free_context_buffers(cm); + vp9_free_pc_tree(&cpi->td); + vpx_free(cpi->mbmi_ext_base); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + cpi->external_resize = 0; + } else if (cm->mi_alloc_size == new_mi_size && + (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) { + if (vp9_alloc_loop_filter(cm)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate loop filter data"); } } @@ -1523,13 +2177,62 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { update_frame_size(cpi); if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { - memset(cpi->consec_zero_mv, 0, - cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vpx_free(cpi->consec_zero_mv); + CHECK_MEM_ERROR( + &cm->error, cpi->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); + + vpx_free(cpi->skin_map); + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); + + if (cpi->svc.number_spatial_layers > 1) { +#if CONFIG_VP9_TEMPORAL_DENOISING + // Reset the denoiser for svc on the resize change. + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_free(&cpi->denoiser); + setup_denoiser_buffer(cpi); + } +#endif + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + for (int sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + const int layer = + LAYER_IDS_TO_IDX(sl, 0, cpi->svc.number_temporal_layers); + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer]; + lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + vpx_free(lc->map); + CHECK_MEM_ERROR( + &cm->error, lc->map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*lc->map))); + vpx_free(lc->last_coded_q_map); + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, + vpx_malloc(cm->mi_rows * cm->mi_cols * + sizeof(*lc->last_coded_q_map))); + memset(lc->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols); + vpx_free(lc->consec_zero_mv); + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, + vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*lc->consec_zero_mv))); + } + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + } + } + + free_copy_partition_data(cpi); + alloc_copy_partition_data(cpi); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers == 1) vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; } - if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + if ((cpi->svc.number_temporal_layers > 1) || ((cpi->svc.number_temporal_layers > 1 || cpi->svc.number_spatial_layers > 1) && cpi->oxcf.pass != 1)) { @@ -1537,6 +2240,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { (int)cpi->oxcf.target_bandwidth); } + vp9_check_reset_rc_flag(cpi); + cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; @@ -1554,12 +2259,9 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif -} -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) + vp9_set_row_mt(cpi); +} /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * @@ -1567,7 +2269,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * The following 2 functions ('cal_nmvjointsadcost' and * * 'cal_nmvsadcosts') are used to calculate cost lookup tables * * used by 'vp9_diamond_search_sad'. The C implementation of the * - * function is generic, but the AVX intrinsics optimised version * + * function is generic, but the NEON intrinsics optimised version * * relies on the following properties of the computed tables: * * For cal_nmvjointsadcost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * @@ -1576,7 +2278,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * - * If these do not hold, then the AVX optimised version of the * + * If these do not hold, then the NEON optimised version of the * * 'vp9_diamond_search_sad' function cannot be used as it is, in which * * case you can revert to using the C function instead. * ***********************************************************************/ @@ -1624,10 +2326,57 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } while (++i <= MV_MAX); } -VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, +static void init_ref_frame_bufs(VP9_COMMON *cm) { + int i; + BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { + pool->frame_bufs[i].ref_count = 0; + } +} + +static void update_initial_width(VP9_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + VP9_COMMON *const cm = &cpi->common; +#if !CONFIG_VP9_HIGHBITDEPTH + (void)use_highbitdepth; + assert(use_highbitdepth == 0); +#endif + + if (!cpi->initial_width || +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth != use_highbitdepth || +#endif + cm->subsampling_x != subsampling_x || + cm->subsampling_y != subsampling_y) { + cm->subsampling_x = subsampling_x; + cm->subsampling_y = subsampling_y; +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth = use_highbitdepth; +#endif + alloc_util_frame_buffers(cpi); + // The initial_width/height is used to clamp the encoding width/height in + // vp9_set_size_literal(). The check below is added to avoid setting the + // initial_width/height to a smaller resolution than the one configured. + // This can happen when the user passes in a lower resolution on the very + // first frame (after creating the encoder with a larger resolution). For + // spatial layers this will prevent user from going back up in resolution + // (i.e., the top layer will get stuck at the lower resolution). + if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) { + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + } + cpi->initial_mbs = cm->MBs; + } +} + +VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; - VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -1645,74 +2394,74 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cm->free_mi = vp9_enc_free_mi; cm->setup_mi = vp9_enc_setup_mi; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); + cpi->compute_frame_low_motion_onepass = 1; cpi->use_svc = 0; - cpi->resize_state = 0; + cpi->resize_state = ORIG; cpi->external_resize = 0; cpi->resize_avg_qp = 0; cpi->resize_buffer_underflow = 0; cpi->use_skin_detection = 0; cpi->common.buffer_pool = pool; + init_ref_frame_bufs(cm); cpi->force_update_segmentation = 0; init_config(cpi, oxcf); - vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + cpi->frame_info = vp9_get_frame_info(oxcf); - cm->current_video_frame = 0; - cpi->partition_search_skippable_frame = 0; + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + vp9_init_rd_parameters(cpi); + + init_frame_indexes(cm); + cpi->initial_width = cpi->oxcf.width; + cpi->initial_height = cpi->oxcf.height; cpi->tile_data = NULL; realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); + CHECK_MEM_ERROR( + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); + +#if !CONFIG_REALTIME_ONLY + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); +#endif CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR( - cm, cpi->mbgraph_stats[i].mb_stats, + &cm->error, cpi->mbgraph_stats[i].mb_stats, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#if CONFIG_FP_MB_STATS - cpi->use_fp_mb_stats = 0; - if (cpi->use_fp_mb_stats) { - // a place holder used to store the first pass mb stats in the first pass - CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, - vpx_calloc(cm->MBs * sizeof(uint8_t), 1)); - } else { - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif - cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); @@ -1752,10 +2501,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR(cm, cpi->ssim_vars, - vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * - cpi->common.mi_rows * cpi->common.mi_cols)); + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; + } else { + cpi->ssim_vars = NULL; } #endif @@ -1785,11 +2536,16 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif #endif #ifdef OUTPUT_YUV_SKINMAP - yuv_skinmap_file = fopen("skinmap.yuv", "ab"); + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif +#ifdef OUTPUT_YUV_SVC_SRC + yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb"); + yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb"); + yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -1798,6 +2554,14 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + { + vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed"); + } + } + +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -1808,130 +2572,149 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; - int i; + int n; - for (i = 0; i < oxcf->ss_number_layers; ++i) { + for (n = 0; n < oxcf->ss_number_layers; ++n) { FIRSTPASS_STATS *const last_packet_for_layer = - &stats[packets - oxcf->ss_number_layers + i]; + &stats[packets - oxcf->ss_number_layers + n]; const int layer_id = (int)last_packet_for_layer->spatial_layer_id; const int packets_in_layer = (int)last_packet_for_layer->count + 1; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { + int num_frames; LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id]; vpx_free(lc->rc_twopass_stats_in.buf); lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; - CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, vpx_malloc(lc->rc_twopass_stats_in.sz)); lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; lc->twopass.stats_in = lc->twopass.stats_in_start; lc->twopass.stats_in_end = lc->twopass.stats_in_start + packets_in_layer - 1; + // Note the last packet is cumulative first pass stats. + // So the number of frames is packet number minus one + num_frames = packets_in_layer - 1; + fps_init_first_pass_info(&lc->twopass.first_pass_info, + lc->rc_twopass_stats_in.buf, num_frames); stats_copy[layer_id] = lc->rc_twopass_stats_in.buf; } } - for (i = 0; i < packets; ++i) { - const int layer_id = (int)stats[i].spatial_layer_id; + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && stats_copy[layer_id] != NULL) { - *stats_copy[layer_id] = stats[i]; + *stats_copy[layer_id] = stats[n]; ++stats_copy[layer_id]; } } vp9_init_second_pass_spatial_svc(cpi); } else { -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - const size_t psz = cpi->common.MBs * sizeof(uint8_t); - const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); - - cpi->twopass.firstpass_mb_stats.mb_stats_start = - oxcf->firstpass_mb_stats_in.buf; - cpi->twopass.firstpass_mb_stats.mb_stats_end = - cpi->twopass.firstpass_mb_stats.mb_stats_start + - (ps - 1) * cpi->common.MBs * sizeof(uint8_t); - } -#endif + int num_frames; cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; cpi->twopass.stats_in = cpi->twopass.stats_in_start; cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; + // Note the last packet is cumulative first pass stats. + // So the number of frames is packet number minus one + num_frames = packets - 1; + fps_init_first_pass_info(&cpi->twopass.first_pass_info, + oxcf->two_pass_stats_in.buf, num_frames); vp9_init_second_pass(cpi); } } +#endif // !CONFIG_REALTIME_ONLY - vp9_set_speed_features_framesize_independent(cpi); - vp9_set_speed_features_framesize_dependent(cpi); + cpi->mb_wiener_var_cols = 0; + cpi->mb_wiener_var_rows = 0; + cpi->mb_wiener_variance = NULL; - // Allocate memory to store variances for a frame. - CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); - cpi->source_var_thresh = 0; - cpi->frames_till_next_var_check = 0; + vp9_set_speed_features_framesize_independent(cpi, oxcf->speed); + vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed); -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; + { + const int bsize = BLOCK_16X16; + const int w = num_8x8_blocks_wide_lookup[bsize]; + const int h = num_8x8_blocks_high_lookup[bsize]; + const int num_cols = (cm->mi_cols + w - 1) / w; + const int num_rows = (cm->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, + vpx_calloc(num_rows * num_cols, + sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); + } - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL, - vpx_sad32x16x4d) + cpi->kmeans_data_arr_alloc = 0; +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 0; +#endif // CONFIG_NON_GREEDY_MV + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + } - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL, - vpx_sad16x32x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL, - vpx_sad64x32x4d) + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) - BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL, - vpx_sad32x64x4d) + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d) - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, - vpx_sad16x8x8, vpx_sad16x8x4d) + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, - vpx_sad8x16x8, vpx_sad8x16x4d) + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, - vpx_sad8x8x8, vpx_sad8x8x4d) + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL, - vpx_sad8x4x8, vpx_sad8x4x4d) + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d) - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL, - vpx_sad4x8x8, vpx_sad4x8x4d) + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, - vpx_sad4x4x8, vpx_sad4x4x4d) + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -1946,6 +2729,17 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_loop_filter_init(cm); + // Set up the unit scaling factor used during motion search. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + cm->error.setjmp = 0; return cpi; @@ -1961,10 +2755,13 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; - int t; if (!cpi) return; +#if CONFIG_INTERNAL_STATS + vpx_free(cpi->ssim_vars); +#endif + cm = &cpi->common; if (cm->current_video_frame > 0) { #if CONFIG_INTERNAL_STATS @@ -1998,16 +2795,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -2025,20 +2826,26 @@ void vp9_remove_compressor(VP9_COMP *cpi) { SNPRINT2(results, "\t%7.3f", cpi->worst_consistency); } - fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings); - fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time, - rate_err, fabs(rate_err)); + SNPRINT(headings, "\t Time\tRcErr\tAbsErr"); + SNPRINT2(results, "\t%8.0f", total_encode_time); + SNPRINT2(results, "\t%7.2f", rate_err); + SNPRINT2(results, "\t%7.2f", fabs(rate_err)); + + fprintf(f, "%s\tAPsnr611\n", headings); + fprintf( + f, "%s\t%7.3f\n", results, + (6 * cpi->psnr.stat[Y] + cpi->psnr.stat[U] + cpi->psnr.stat[V]) / + (cpi->count * 8)); } fclose(f); } - #endif #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, cpi->time_compress_data / 1000, @@ -2051,29 +2858,23 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_denoiser_free(&(cpi->denoiser)); #endif - for (t = 0; t < cpi->num_workers; ++t) { - VPxWorker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; - - // Deallocate allocated threads. - vpx_get_worker_interface()->end(worker); - - // Deallocate allocated thread data. - if (t < cpi->num_workers - 1) { - vpx_free(thread_data->td->counts); - vp9_free_pc_tree(thread_data->td); - vpx_free(thread_data->td); - } - } - vpx_free(cpi->tile_thr_data); - vpx_free(cpi->workers); - - if (cpi->num_workers > 1) { - vp9_loop_filter_dealloc(&cpi->lf_row_sync); - vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + if (cpi->kmeans_data_arr_alloc) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&cpi->kmeans_mutex); +#endif + vpx_free(cpi->kmeans_data_arr); } + vp9_free_tpl_buffer(cpi); + + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_row_mt_mem_dealloc(cpi); + vp9_encode_free_mt_data(cpi); + +#if !CONFIG_REALTIME_ONLY vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); +#endif dealloc_compressor_data(cpi); @@ -2082,12 +2883,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->mbgraph_stats[i].mb_stats); } -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vpx_free(cpi->twopass.frame_mb_stats_buf); - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif + vp9_extrc_delete(&cpi->ext_ratectrl); + + // Help detect use after free of the error detail string. + memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); @@ -2107,6 +2907,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif +#ifdef OUTPUT_YUV_SVC_SRC + fclose(yuv_svc_src[0]); + fclose(yuv_svc_src[1]); + fclose(yuv_svc_src[2]); +#endif #if 0 @@ -2122,30 +2927,21 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif } -static void generate_psnr_packet(VP9_COMP *cpi) { - struct vpx_codec_cx_pkt pkt; - int i; - PSNR_STATS psnr; +int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr) { + if (is_psnr_calc_enabled(cpi)) { #if CONFIG_VP9_HIGHBITDEPTH - vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, &psnr, - cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); + vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr, + cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth, + cpi->svc.spatial_layer_id); #else - vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, &psnr); + vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr, + cpi->svc.spatial_layer_id); #endif - - for (i = 0; i < 4; ++i) { - pkt.data.psnr.samples[i] = psnr.samples[i]; - pkt.data.psnr.sse[i] = psnr.sse[i]; - pkt.data.psnr.psnr[i] = psnr.psnr[i]; + return 1; + } else { + vp9_zero(*psnr); + return 0; } - pkt.kind = VPX_CODEC_PSNR_PKT; - if (cpi->use_svc) - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers] - .psnr_pkt = pkt.data.psnr; - else - vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); } int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) { @@ -2164,7 +2960,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { - MV_REFERENCE_FRAME ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; if (ref_frame_flag == VP9_LAST_FLAG) ref_frame = LAST_FRAME; else if (ref_frame_flag == VP9_GOLD_FLAG) @@ -2172,14 +2968,15 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( else if (ref_frame_flag == VP9_ALT_FLAG) ref_frame = ALTREF_FRAME; - return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame); + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); } int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); return 0; } else { return -1; @@ -2190,7 +2987,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(sd, cfg); + vpx_yv12_copy_frame(sd, cfg); return 0; } else { return -1; @@ -2203,38 +3000,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) -// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it -// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do -// not denoise the UV channels at this time. If ever we implement UV channel -// denoising we will have to modify this. -void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; @@ -2297,12 +3062,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { #endif #if CONFIG_VP9_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { #else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { #endif // CONFIG_VP9_HIGHBITDEPTH // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; @@ -2340,17 +3104,36 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #if CONFIG_VP9_HIGHBITDEPTH static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int bd) { + YV12_BUFFER_CONFIG *dst, int bd, + INTERP_FILTER filter_type, + int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -2358,24 +3141,24 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_stride = src_strides[i]; const int dst_stride = dst_strides[i]; for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * src_stride + (x / factor) * src_w / dst_w; uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } @@ -2383,46 +3166,9 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, vpx_extend_frame_borders(dst); } -#else -void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - const int src_w = src->y_crop_width; - const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, - src->v_buffer }; - const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; - uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; - const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - int x, y, i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = (i == 0 || i == 3 ? 1 : 2); - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h; - for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); - } - } - } - - vpx_extend_frame_borders(dst); -} #endif // CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY static int scale_down(VP9_COMP *cpi, int q) { RATE_CONTROL *const rc = &cpi->rc; GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -2439,20 +3185,45 @@ static int scale_down(VP9_COMP *cpi, int q) { return scale; } -static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) { +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; - return (rc->projected_frame_size > ((high_limit * 3) / 2)) || - (rc->projected_frame_size < (low_limit / 2)); + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } } // test in two pass for the first static int two_pass_first_group_inter(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - if ((cpi->oxcf.pass == 2) && - (gf_group->index == gf_group->first_inter_index)) { - return 1; + if (cpi->oxcf.pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int gfg_index = gf_group->index; + + if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE; + return gf_group->update_type[gfg_index - 1] != LF_UPDATE && + gf_group->update_type[gfg_index] == LF_UPDATE; } else { return 0; } @@ -2468,8 +3239,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - big_rate_miss(cpi, high_limit, low_limit) || - (cpi->sf.recode_loop == ALLOW_RECODE) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || (two_pass_first_group_inter(cpi) && (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { @@ -2479,8 +3249,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } - // Force recode if projected_frame_size > max_frame_bandwidth - if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; + + // Force recode for extreme overshoot. + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2497,10 +3272,52 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, } return force_recode; } +#endif // !CONFIG_REALTIME_ONLY -void vp9_update_reference_frames(VP9_COMP *cpi) { +static void update_ref_frames(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + const int this_gf_index = gf_group->index; + const int update_ref_idx = gf_group->update_ref_idx[this_gf_index]; + if (gf_group->update_type[this_gf_index] == KF_UPDATE) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[0], cm->new_fb_idx); + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[1], cm->new_fb_idx); + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[2], cm->new_fb_idx); + } else if (update_ref_idx != INVALID_IDX) { + ref_cnt_fb(pool->frame_bufs, + &cm->ref_frame_map[gf_group->update_ref_idx[this_gf_index]], + cm->new_fb_idx); + } + + const int next_gf_index = gf_group->index + 1; + + // Overlay frame should ideally look at the colocated ref frame from rc lib. + // Here temporarily just don't update the indices. + if (next_gf_index < gf_group->gf_group_size) { + cpi->lst_fb_idx = gf_group->ext_rc_ref[next_gf_index].last_index; + cpi->gld_fb_idx = gf_group->ext_rc_ref[next_gf_index].golden_index; + cpi->alt_fb_idx = gf_group->ext_rc_ref[next_gf_index].altref_index; + } + + return; + } + + if (cpi->rc.show_arf_as_gld) { + int tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else if (cm->show_existing_frame) { + // Pop ARF. + cpi->lst_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. @@ -2526,23 +3343,23 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; - cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; - } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } + int arf_idx = gf_group->top_arf_idx; + + // Push new ARF into stack. + stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx, + gf_group->stack_size); + ++gf_group->stack_size; + + assert(arf_idx < REF_FRAMES); ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + + cpi->alt_fb_idx = arf_idx; } if (cpi->refresh_golden_frame) { @@ -2567,46 +3384,60 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + update_ref_frames(cpi); + #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - cpi->denoiser.denoising_level > kDenLowLow) { - vp9_denoiser_update_frame_info( - &cpi->denoiser, *cpi->Source, cpi->common.frame_type, - cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->resize_pending); - } + vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) { - // Keep track of frame index for each reference frame. - SVC *const svc = &cpi->svc; - if (cm->frame_type == KEY_FRAME) { - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } - } + + if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; + int is_reference_frame = + (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (cpi->use_svc && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + is_reference_frame = !cpi->svc.non_reference_frame; + + // Skip loop filter in show_existing_frame mode. + if (cm->show_existing_frame) { + lf->filter_level = 0; + return; + } + + if (cpi->loopfilter_ctrl == NO_LOOPFILTER || + (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) { + lf->filter_level = 0; + vpx_extend_frame_inner_borders(cm->frame_to_show); + return; + } if (xd->lossless) { lf->filter_level = 0; lf->last_filt_level = 0; } else { +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer timer; +#endif vpx_clear_system_state(); +#if CONFIG_INTERNAL_STATS vpx_usec_timer_start(&timer); +#endif if (!cpi->rc.is_src_frame_alt_ref) { if ((cpi->common.frame_type == KEY_FRAME) && @@ -2619,11 +3450,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { lf->filter_level = 0; } +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&timer); cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); +#endif } - if (lf->filter_level > 0) { + if (lf->filter_level > 0 && is_reference_frame) { vp9_build_mask_frame(cm, lf->filter_level, 0); if (cpi->num_workers > 1) @@ -2637,19 +3470,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -2668,7 +3488,6 @@ void vp9_scale_references(VP9_COMP *cpi) { continue; } -#if CONFIG_VP9_HIGHBITDEPTH if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { RefCntBuffer *new_fb_ptr = NULL; int force_scaling = 0; @@ -2681,6 +3500,7 @@ void vp9_scale_references(VP9_COMP *cpi) { new_fb_ptr = &pool->frame_bufs[new_fb]; if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || new_fb_ptr->buf.y_crop_height != cm->height) { +#if CONFIG_VP9_HIGHBITDEPTH if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, cm->use_highbitdepth, @@ -2688,42 +3508,28 @@ void vp9_scale_references(VP9_COMP *cpi) { cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; - alloc_frame_mvs(cm, new_fb); - } + scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth, + EIGHTTAP, 0); #else - if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; - int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - force_scaling = 1; - } - if (new_fb == INVALID_IDX) return; - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); - vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf); + vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0); +#endif // CONFIG_VP9_HIGHBITDEPTH cpi->scaled_ref_idx[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } -#endif // CONFIG_VP9_HIGHBITDEPTH } else { int buf_idx; RefCntBuffer *buf = NULL; if (cpi->oxcf.pass == 0 && !cpi->use_svc) { // Check for release of scaled reference. buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; - buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL; - if (buf != NULL) { + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; --buf->ref_count; cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; } @@ -2754,22 +3560,21 @@ static void release_scaled_references(VP9_COMP *cpi) { refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } } } } else { - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < REFS_PER_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; --buf->ref_count; cpi->scaled_ref_idx[i] = INVALID_IDX; } @@ -2828,26 +3633,46 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -2926,7 +3751,7 @@ static void set_mv_search_params(VP9_COMP *cpi) { } static void set_size_independent_vars(VP9_COMP *cpi) { - vp9_set_speed_features_framesize_independent(cpi); + vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); vp9_set_rd_speed_thresholds(cpi); vp9_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; @@ -2935,29 +3760,39 @@ static void set_size_independent_vars(VP9_COMP *cpi) { static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, int *top_index) { VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. - vp9_set_speed_features_framesize_dependent(cpi); + vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); // Decide q and q bounds. *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) { + *q = cpi->rc.worst_quality; + cpi->rc.force_max_q = 0; + } + + if (cpi->use_svc) { + cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } +#if !CONFIG_REALTIME_ONLY // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. - if (oxcf->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) - if (oxcf->noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; - switch (oxcf->noise_sensitivity) { + switch (cpi->oxcf.noise_sensitivity) { case 1: l = 20; break; case 2: l = 40; break; case 3: l = 60; break; @@ -2966,31 +3801,16 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - cpi->common.postproc_state.limits = vpx_calloc( - cpi->common.width, sizeof(*cpi->common.postproc_state.limits)); + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, + vpx_calloc(cpi->un_scaled_source->y_width, + sizeof(*cpi->common.postproc_state.limits))); } - vp9_denoise(cpi->Source, cpi->Source, l, cpi->common.postproc_state.limits); + vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l, + cpi->common.postproc_state.limits); } #endif // CONFIG_VP9_POSTPROC } -#if CONFIG_VP9_TEMPORAL_DENOISING -static void setup_denoiser_buffer(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - if (cpi->oxcf.noise_sensitivity > 0 && - !cpi->denoiser.frame_buffer_initialized) { - if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate denoiser"); - } -} -#endif - static void init_motion_estimation(VP9_COMP *cpi) { int y_stride = cpi->scaled_source.y_stride; @@ -3007,6 +3827,7 @@ static void set_frame_size(VP9_COMP *cpi) { VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { @@ -3017,16 +3838,21 @@ static void set_frame_size(VP9_COMP *cpi) { vp9_set_size_literal(cpi, oxcf->scaled_frame_width, oxcf->scaled_frame_height); } +#endif // !CONFIG_REALTIME_ONLY - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && + if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { - oxcf->scaled_frame_width = - (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; - oxcf->scaled_frame_height = - (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den; - // There has been a change in frame size. - vp9_set_size_literal(cpi, oxcf->scaled_frame_width, - oxcf->scaled_frame_height); + // For SVC scaled width/height will have been set (svc->resize_set=1) + // in get_svc_params based on the layer width/height. + if (!cpi->use_svc || !cpi->svc.resize_set) { + oxcf->scaled_frame_width = + (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den; + oxcf->scaled_frame_height = + (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den; + // There has been a change in frame size. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); @@ -3044,9 +3870,7 @@ static void set_frame_size(VP9_COMP *cpi) { #endif } - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_set_target_rate(cpi); } @@ -3066,6 +3890,7 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); + int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); @@ -3084,55 +3909,158 @@ static void set_frame_size(VP9_COMP *cpi) { buf->y_crop_height, cm->width, cm->height); #endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); } else { ref_buf->buf = NULL; } } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_ERROR, + "Can't find at least one reference frame with valid size"); + } set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +static void save_encode_params(VP9_COMP *cpi) { + int tile_idx; + int i, j; + TileDataEnc *tile_data; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); + } +} + +static INLINE void set_raw_source_frame(VP9_COMP *cpi) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif +} + +static YV12_BUFFER_CONFIG *svc_twostage_scale( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, + int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->bit_depth == VPX_BITS_8) { + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, + phase_scaler); + } else { + scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth, + filter_type2, phase_scaler2); + scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + } +#else + vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2, + phase_scaler2); + vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler); +#endif // CONFIG_VP9_HIGHBITDEPTH + return scaled; + } else { + return unscaled; + } +} + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest, size_t dest_size) { VP9_COMMON *const cm = &cpi->common; - int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + SVC *const svc = &cpi->svc; + int q = 0, bottom_index = 0, top_index = 0; + int no_drop_scene_change = 0; + const INTERP_FILTER filter_scaler = + (is_one_pass_svc(cpi)) + ? svc->downsample_filter_type[svc->spatial_layer_id] + : EIGHTTAP; + const int phase_scaler = + (is_one_pass_svc(cpi)) + ? svc->downsample_filter_phase[svc->spatial_layer_id] + : 0; + + if (cm->show_existing_frame) { + cpi->rc.this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return 1; + } + + svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe; + + // Flag to check if its valid to compute the source sad (used for + // scene detection and for superblock content state in CBR mode). + // The flag may get reset below based on SVC or resizing state. + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; vpx_clear_system_state(); set_frame_size(cpi); - if (is_one_pass_cbr_svc(cpi) && + if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && - cpi->svc.scaled_temp.y_width == cm->width << 1 && - cpi->svc.scaled_temp.y_height == cm->height << 1) { + svc->scaled_temp.y_width == cm->width << 1 && + svc->scaled_temp.y_height == cm->height << 1) { // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. - cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp); - cpi->svc.scaled_one_half = 1; - } else if (is_one_pass_cbr_svc(cpi) && + const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; + const int phase_scaler2 = svc->downsample_filter_phase[1]; + cpi->Source = svc_twostage_scale( + cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, + filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); + svc->scaled_one_half = 1; + } else if (is_one_pass_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && - cpi->svc.scaled_one_half) { + svc->scaled_one_half) { // If the spatial layer is 1/2x1/2 and the scaling is already done in the // two-stage scaling, use the result directly. - cpi->Source = &cpi->svc.scaled_temp; - cpi->svc.scaled_one_half = 0; + cpi->Source = &svc->scaled_temp; + svc->scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), + filter_scaler, phase_scaler); } +#ifdef OUTPUT_YUV_SVC_SRC + // Write out at most 3 spatial layers. + if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) { + vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); + } +#endif // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { #ifdef ENABLE_KF_DENOISE if (is_spatial_denoise_enabled(cpi)) { - cpi->raw_source_frame = - vp9_scale_if_required(cm, &cpi->raw_unscaled_source, - &cpi->raw_scaled_source, (cpi->oxcf.pass == 0)); + cpi->raw_source_frame = vp9_scale_if_required( + cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler); } else { cpi->raw_source_frame = cpi->Source; } @@ -3141,58 +4069,170 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, #endif } + if ((cpi->use_svc && + (svc->spatial_layer_id < svc->number_spatial_layers - 1 || + svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->current_superframe < 1)) || + cpi->resize_pending || cpi->resize_state || cpi->external_resize || + cpi->resize_state != ORIG) { + cpi->compute_source_sad_onepass = 0; + if (cpi->content_state_sb_fd != NULL) + memset(cpi->content_state_sb_fd, 0, + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * + sizeof(*cpi->content_state_sb_fd)); + } + // Avoid scaling last_source unless its needed. - // Last source is needed if vp9_avg_source_sad() is used, or if - // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise - // estimation is enabled. + // Last source is needed if avg_source_sad() is used, or if noise estimation + // is enabled. if (cpi->unscaled_last_source != NULL && (cpi->oxcf.content == VP9E_CONTENT_SCREEN || (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) || - cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || - cpi->noise_estimate.enabled)) - cpi->Last_Source = - vp9_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source, (cpi->oxcf.pass == 0)); + (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || + cpi->compute_source_sad_onepass)) + cpi->Last_Source = vp9_scale_if_required( + cm, cpi->unscaled_last_source, &cpi->scaled_last_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); - if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + cpi->compute_source_sad_onepass = 0; + + if (frame_is_intra_only(cm) || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + vp9_denoiser_reset_on_first_frame(cpi); +#endif + + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed. + cpi->rc.high_source_sad = 0; + cpi->rc.hybrid_intra_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 0; + if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && + (cpi->oxcf.rc_mode == VPX_VBR || + cpi->oxcf.content == VP9E_CONTENT_SCREEN || cpi->oxcf.speed >= 5)) + vp9_scene_detection_onepass(cpi); + + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->high_source_sad_superframe = cpi->rc.high_source_sad; + svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion; + // On scene change reset temporal layer pattern to TL0. + // Note that if the base/lower spatial layers are skipped: instead of + // inserting base layer here, we force max-q for the next superframe + // with lower spatial layers: this is done in vp9_encodedframe_overshoot() + // when max-q is decided for the current layer. + // Only do this reset for bypass/flexible mode. + if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 && + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // rc->high_source_sad will get reset so copy it to restore it. + int tmp_high_source_sad = cpi->rc.high_source_sad; + vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME); + cpi->rc.high_source_sad = tmp_high_source_sad; + } + } + vp9_update_noise_estimate(cpi); - if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && - cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && - (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR || cpi->sf.copy_partition_flag) && - cm->show_frame) - vp9_avg_source_sad(cpi); + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, if base layer is key for svc, + // on scene change, or if superframe has layer sync. + if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) && + !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0])) + no_drop_scene_change = 1; + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + !frame_is_intra_only(cm) && !no_drop_scene_change && + !svc->superframe_has_layer_sync && + (!cpi->use_svc || + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } - // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference - // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this - // frame-level upsampling. - if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) { + // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame + // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can + // avoid this frame-level upsampling (for non intra_only frames). + // For SVC single_layer mode, dynamic resize is allowed and we need to + // scale references for this case. + if (frame_is_intra_only(cm) == 0 && + ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) || + !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // search method and step parameter might be changed in speed settings. + init_motion_estimation(cpi); + + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + + if (cpi->sf.svc_use_lowres_part && + svc->spatial_layer_id == svc->number_spatial_layers - 2) { + if (svc->prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + &cm->error, svc->prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*svc->prev_partition_svc))); + } + } + + // TODO(jianj): Look into issue of skin detection with high bitdepth. + if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { cpi->use_skin_detection = 1; } - vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + // Enable post encode frame dropping for CBR on non key frame, when + // ext_use_post_encode_drop is specified by user. + cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop && + cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME; + + vp9_set_quantizer(cpi, q, 0); + vp9_set_variance_partition_thresholds(cpi, q, 0); setup_frame(cpi); suppress_active_map(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } + + if (cpi->rc.last_post_encode_dropped_scene_change) { + cpi->rc.high_source_sad = 1; + svc->high_source_sad_superframe = 1; + // For now disable use_source_sad since Last_Source will not be the previous + // encoded but the dropped one. + cpi->sf.use_source_sad = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + } + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + if (vp9_encodedframe_overshoot(cpi, -1, &q)) { + vp9_set_quantizer(cpi, q, 0); + vp9_set_variance_partition_thresholds(cpi, q, 0); + } + } + +#if !CONFIG_REALTIME_ONLY // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -3201,41 +4241,64 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_360aq_frame_setup(cpi); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vp9_cyclic_refresh_setup(cpi); } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else { +#endif + // If ROI is enabled and skip feature is used for segmentation, apply cyclic + // refresh but not apply ROI for skip for the first 20 frames (defined by + // FRAMES_NO_SKIPPING_AFTER_KEY) after key frame to improve quality. + if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_setup(cpi); + if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY) + apply_roi_map(cpi); + } else { + apply_roi_map(cpi); + } + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_setup(cpi); + } + +#if !CONFIG_REALTIME_ONLY } +#endif apply_active_map(cpi); vp9_encode_frame(cpi); - // Check if we should drop this frame because of high overshoot. - // Only for frames where high temporal-source SAD is detected. - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == 0 && cm->frame_type != KEY_FRAME && - cpi->oxcf.content == VP9E_CONTENT_SCREEN && - cpi->rc.high_source_sad == 1) { + // Check if we should re-encode this frame at high Q because of high + // overshoot based on the encoded frame size. Only for frames where + // high temporal-source SAD is detected. + // For SVC: all spatial layers are checked for re-encoding. + if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { int frame_size = 0; // Get an estimate of the encoded frame size. save_coding_context(cpi); - vp9_pack_bitstream(cpi, dest, size); + vp9_pack_bitstream(cpi, dest, dest_size, size); restore_coding_context(cpi); frame_size = (int)(*size) << 3; // Check if encoded frame will overshoot too much, and if so, set the q and // adjust some rate control parameters, and return to re-encode the frame. if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) { vpx_clear_system_state(); - vp9_set_quantizer(cm, q); - vp9_set_variance_partition_thresholds(cpi, q); + vp9_set_quantizer(cpi, q, 0); + vp9_set_variance_partition_thresholds(cpi, q, 0); suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; vp9_disable_segmentation(&cm->seg); } apply_active_map(cpi); @@ -3243,19 +4306,40 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } } - // Update some stats from cyclic refresh, and check if we should not update - // golden reference, for non-SVC 1 pass CBR. - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME && - !cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 && - (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR)) - vp9_cyclic_refresh_check_golden_update(cpi); + // Update some stats from cyclic refresh, and check for golden frame update. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode) + vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } +static int get_ref_frame_flags(const VP9_COMP *cpi) { + const int *const map = cpi->common.ref_frame_map; + const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; + const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + if (gold_is_last) flags &= ~VP9_GOLD_FLAG; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + (cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers == 1)) + flags &= ~VP9_GOLD_FLAG; + + if (alt_is_last) flags &= ~VP9_ALT_FLAG; + + if (gold_is_alt) flags &= ~VP9_ALT_FLAG; + + return flags; +} + +#if !CONFIG_REALTIME_ONLY #define MAX_QSTEP_ADJ 4 static int get_qstep_adj(int rate_excess, int rate_limit) { int qstep = @@ -3263,8 +4347,9 @@ static int get_qstep_adj(int rate_excess, int rate_limit) { return VPXMIN(qstep, MAX_QSTEP_ADJ); } -static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest, + size_t dest_size) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; @@ -3277,13 +4362,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; int enable_acl; +#ifdef AGGRESSIVE_VBR + int qrange_adj = 1; +#endif + + const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; + + if (cm->show_existing_frame) { + rc->this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return; + } set_size_independent_vars(cpi); - enable_acl = cpi->sf.allow_acl - ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) - : 0; + enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) || + (cpi->twopass.gf_group.index == 1) + : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -3292,6 +4391,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (loop_count == 0 || cpi->resize_pending != 0) { set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + // Adjustment limits for min and max q + qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); + + bottom_index = + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); + } +#endif // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. set_mv_search_params(cpi); @@ -3315,8 +4424,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, &frame_over_shoot_limit); } - cpi->Source = vp9_scale_if_required( - cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0)); + cpi->Source = + vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. @@ -3325,7 +4435,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3337,7 +4447,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->unscaled_last_source != NULL) cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source, - (cpi->oxcf.pass == 0)); + (oxcf->pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3346,20 +4456,62 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_scale_references(cpi); } - vp9_set_quantizer(cm, q); + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int ext_rc_delta_q_uv = 0; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + vpx_codec_err_t codec_status; + vpx_rc_encodeframe_decision_t encode_frame_decision; + int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE; + int frame_height_sb = (cm->height + sb_size - 1) / sb_size; + int frame_width_sb = (cm->width + sb_size - 1) / sb_size; + CHECK_MEM_ERROR(&cm->error, encode_frame_decision.sb_params_list, + (sb_params *)vpx_calloc( + frame_height_sb * frame_width_sb, + sizeof(*encode_frame_decision.sb_params_list))); + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + for (int idx = 0; idx < frame_height_sb * frame_width_sb; ++idx) { + cpi->sb_mul_scale[idx] = + (((int64_t)encode_frame_decision.sb_params_list[idx].rdmult * 256) / + (encode_frame_decision.rdmult + 1)); + } + vpx_free(encode_frame_decision.sb_params_list); + // If the external model recommends a reserved value, we use + // libvpx's default q. + if (encode_frame_decision.q_index != VPX_DEFAULT_Q) { + q = encode_frame_decision.q_index; + } + ext_rc_delta_q_uv = encode_frame_decision.delta_q_uv; + } + + if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) { + fprintf(cpi->ext_ratectrl.log_file, + "ENCODE_FRAME_INFO gop_index %d update_type %d q %d\n", + gf_group->index, gf_group->update_type[gf_group->index], q); + } + + vp9_set_quantizer(cpi, q, ext_rc_delta_q_uv); if (loop_count == 0) setup_frame(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + } else if (oxcf->aq_mode == EQUATOR360_AQ) { vp9_360aq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (oxcf->aq_mode == PSNR_AQ) { + vp9_psnr_aq_mode_setup(&cm->seg); } vp9_encode_frame(cpi); @@ -3375,14 +4527,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // to recode. if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { save_coding_context(cpi); - if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); + if (!cpi->sf.use_nonrd_pick_mode) + vp9_pack_bitstream(cpi, dest, dest_size, size); rc->projected_frame_size = (int)(*size) << 3; if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == VPX_Q) { + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + break; + } + + if (oxcf->rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && @@ -3462,14 +4620,25 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) - q_high = rc->worst_quality; + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (!rc->is_src_frame_alt_ref && + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi))))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); + double q_val_high; + q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); + q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); + q_high = clamp(q_high, rc->best_quality, rc->worst_quality); + } // Raise Qlow as to at least the current value qstep = get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); q_low = VPXMIN(q + qstep, q_high); - // q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3497,31 +4666,29 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qstep = get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); q_high = VPXMAX(q - qstep, q_low); - // q_high = q > q_low ? q - 1 : q_low; if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above - // the user passsed in value. - if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { + // the user passed in value. + if (oxcf->rc_mode == VPX_CQ && q < q_low) { q_low = q; } while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); retries++; } } - undershoot_seen = 1; } @@ -3549,42 +4716,47 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) - if (loop || !enable_acl) restore_coding_context(cpi); + if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); + rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; + +#ifdef AGGRESSIVE_VBR + if (two_pass_first_group_inter(cpi)) { + cpi->twopass.active_worst_quality = + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { +#endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } + if (enable_acl) { - vp9_encode_frame(cpi); + // Skip recoding, if model diff is below threshold + const int thresh = compute_context_model_thresh(cpi); + const int diff = compute_context_model_diff(cm); + if (diff >= thresh) { + vp9_encode_frame(cpi); + } + } + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { vpx_clear_system_state(); restore_coding_context(cpi); - vp9_pack_bitstream(cpi, dest, size); - - vp9_encode_frame(cpi); - vpx_clear_system_state(); - - restore_coding_context(cpi); } } - -static int get_ref_frame_flags(const VP9_COMP *cpi) { - const int *const map = cpi->common.ref_frame_map; - const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; - const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; - const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; - int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - if (gold_is_last) flags &= ~VP9_GOLD_FLAG; - - if (cpi->rc.frames_till_gf_update_due == INT_MAX && - (cpi->svc.number_temporal_layers == 1 && - cpi->svc.number_spatial_layers == 1)) - flags &= ~VP9_GOLD_FLAG; - - if (alt_is_last) flags &= ~VP9_ALT_FLAG; - - if (gold_is_alt) flags &= ~VP9_ALT_FLAG; - - return flags; -} +#endif // !CONFIG_REALTIME_ONLY static void set_ext_overrides(VP9_COMP *cpi) { // Overrides the defaults with the externally supplied values with @@ -3602,18 +4774,28 @@ static void set_ext_overrides(VP9_COMP *cpi) { } } -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp) { +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) { if (cm->mi_cols * MI_SIZE != unscaled->y_width || cm->mi_rows * MI_SIZE != unscaled->y_height) { #if CONFIG_VP9_HIGHBITDEPTH - scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth); - scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth); + if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) + if (cm->bit_depth == VPX_BITS_8) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, + filter_type, phase_scaler); + else + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); #else - vp9_scale_and_extend_frame(unscaled, scaled_temp); - vp9_scale_and_extend_frame(scaled_temp, scaled); + if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && + unscaled->y_height <= (scaled->y_height << 1)) + vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); + else + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -3621,45 +4803,21 @@ YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, } } -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler) { - if (cm->mi_cols * MI_SIZE != unscaled->y_width || - cm->mi_rows * MI_SIZE != unscaled->y_height) { -#if CONFIG_VP9_HIGHBITDEPTH - if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && - unscaled->y_height <= (scaled->y_height << 1)) - scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth); - else - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); -#else - if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && - unscaled->y_height <= (scaled->y_height << 1)) - vp9_scale_and_extend_frame(unscaled, scaled); - else - scale_and_extend_frame_nonnormative(unscaled, scaled); -#endif // CONFIG_VP9_HIGHBITDEPTH - return scaled; - } else { - return unscaled; - } -} - -static void set_arf_sign_bias(VP9_COMP *cpi) { +static void set_ref_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int arf_sign_bias; + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + const int cur_frame_index = ref_buffer->frame_index; + MV_REFERENCE_FRAME ref_frame; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + const RefCntBuffer *const ref_cnt_buf = + get_ref_cnt_buffer(&cpi->common, buf_idx); + if (ref_cnt_buf) { + cm->ref_frame_sign_bias[ref_frame] = + cur_frame_index < ref_cnt_buf->frame_index; + } } - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } static int setup_interp_filter_search_mask(VP9_COMP *cpi) { @@ -3688,9 +4846,9 @@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) { } #ifdef ENABLE_KF_DENOISE -// Baseline Kernal weights for denoise -static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, +// Baseline kernel weights for denoise +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, @@ -3707,37 +4865,37 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; } - // Select the kernal size. + // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -3752,37 +4910,37 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint16_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; } - // Select the kernal size. + // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -3792,7 +4950,7 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, } #endif // CONFIG_VP9_HIGHBITDEPTH -// Apply thresholded spatial noise supression to a given buffer. +// Apply thresholded spatial noise suppression to a given buffer. static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer, const int stride, const int width, const int height, const int strength) { @@ -3817,7 +4975,7 @@ static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer, } } -// Apply thresholded spatial noise supression to source. +// Apply thresholded spatial noise suppression to source. static void spatial_denoise_frame(VP9_COMP *cpi) { YV12_BUFFER_CONFIG *src = cpi->Source; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -3827,8 +4985,7 @@ static void spatial_denoise_frame(VP9_COMP *cpi) { // Base the filter strength on the current active max Q. const int q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, cm->bit_depth)); - int strength = - VPXMAX(oxcf->arnr_strength >> 2, VPXMIN(oxcf->arnr_strength, (q >> 4))); + int strength = clamp(q >> 4, oxcf->arnr_strength >> 2, oxcf->arnr_strength); // Denoise each of Y,U and V buffers. spatial_denoise_buffer(cpi, src->y_buffer, src->y_stride, src->y_width, @@ -3843,8 +5000,9 @@ static void spatial_denoise_frame(VP9_COMP *cpi) { } #endif // ENABLE_KF_DENOISE +#if !CONFIG_REALTIME_ONLY static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { + uint8_t *dest, size_t dest_size) { if (cpi->common.seg.enabled) if (ALT_REF_AQ_PROTECT_GAIN) { size_t nsize = *size; @@ -3855,7 +5013,7 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, save_coding_context(cpi); vp9_disable_segmentation(&cpi->common.seg); - vp9_pack_bitstream(cpi, dest, &nsize); + vp9_pack_bitstream(cpi, dest, dest_size, &nsize); restore_coding_context(cpi); overhead = (int)*size - (int)nsize; @@ -3866,15 +5024,263 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, vp9_enable_segmentation(&cpi->common.seg); } } +#endif -static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, - uint8_t *dest, - unsigned int *frame_flags) { +static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + + if (ref_buffer) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + ref_buffer->frame_index = + cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; + ref_buffer->frame_coding_index = cm->current_frame_coding_index; + } +} + +static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->Source->y_buffer; + const int y_stride = cpi->Source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size]; + const int num_8x8_h = num_8x8_blocks_high_lookup[block_size]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + double log_sum = 0.0; + int row, col; + + // Loop through each 64x64 block. + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + int mi_row, mi_col; + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + for (mi_row = row * num_8x8_h; + mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) { + for (mi_col = col * num_8x8_w; + mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) { + struct buf_2d buf; + const int row_offset_y = mi_row << 3; + const int col_offset_y = mi_col << 3; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + // In order to make SSIM_VAR_SCALE in a same scale for both 8 bit + // and high bit videos, the variance needs to be divided by 2.0 or + // 64.0 separately. + // TODO(sdeng): need to tune for 12bit videos. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + var += vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd); + else +#endif + var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8); + + num_of_var += 1.0; + } + } + var = var / num_of_var / 64.0; + + // Curve fitting with an exponential model on all 16x16 blocks from the + // Midres dataset. + var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; + cpi->mi_ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum; + } + } + + (void)xd; +} + +// Process the wiener variance in 16x16 block basis. +static int qsort_comp(const void *elem1, const void *elem2) { + int a = *((const int *)elem1); + int b = *((const int *)elem2); + if (a > b) return 1; + if (a < b) return -1; + return 0; +} + +static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows && + cpi->mb_wiener_var_cols >= cm->mb_cols) + return; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + CHECK_MEM_ERROR( + &cm->error, cpi->mb_wiener_variance, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); + cpi->mb_wiener_var_rows = cm->mb_rows; + cpi->mb_wiener_var_cols = cm->mb_cols; +} + +static void init_sb_mul_scale_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + if (cpi->mb_wiener_var_rows >= cm->mb_rows && + cpi->mb_wiener_var_cols >= cm->mb_cols) + return; + + vpx_free(cpi->sb_mul_scale); + cpi->sb_mul_scale = NULL; + + CHECK_MEM_ERROR( + &cm->error, cpi->sb_mul_scale, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->sb_mul_scale))); + cpi->mb_wiener_var_rows = cm->mb_rows; + cpi->mb_wiener_var_cols = cm->mb_cols; +} + +static void set_mb_wiener_variance(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + uint8_t *buffer = cpi->Source->y_buffer; + int buf_stride = cpi->Source->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]); + uint8_t *zero_pred; +#else + DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]); +#endif + + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + + int mb_row, mb_col, count = 0; + // Hard coded operating block size + const int block_size = 16; + const int coeff_count = block_size * block_size; + const TX_SIZE tx_size = TX_16X16; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf = cpi->Source; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + zero_pred = CONVERT_TO_BYTEPTR(zero_pred16); + memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count); + } else { + zero_pred = zero_pred8; + memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count); + } +#else + memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count); +#endif + + cpi->norm_wiener_variance = 0; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int idx; + int16_t median_val = 0; + uint8_t *mb_buffer = + buffer + mb_row * block_size * buf_stride + mb_col * block_size; + int64_t wiener_variance = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size, + xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } else { + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } +#else + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); +#endif // CONFIG_VP9_HIGHBITDEPTH + + coeff[0] = 0; + for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]); + + qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp); + + // Noise level estimation + median_val = coeff[coeff_count / 2]; + + // Wiener filter + for (idx = 1; idx < coeff_count; ++idx) { + int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx]; + int64_t tmp_coeff = (int64_t)coeff[idx]; + if (median_val) { + tmp_coeff = (sqr_coeff * coeff[idx]) / + (sqr_coeff + (int64_t)median_val * median_val); + } + wiener_variance += tmp_coeff * tmp_coeff; + } + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] = + wiener_variance / coeff_count; + cpi->norm_wiener_variance += + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col]; + ++count; + } + } + + if (count) cpi->norm_wiener_variance /= count; + cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance); +} + +#if !CONFIG_REALTIME_ONLY +static PSNR_STATS compute_psnr_stats(const YV12_BUFFER_CONFIG *source_frame, + const YV12_BUFFER_CONFIG *coded_frame, + uint32_t bit_depth, + uint32_t input_bit_depth, + int spatial_layer_id) { + PSNR_STATS psnr; +#if CONFIG_VP9_HIGHBITDEPTH + vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth, + input_bit_depth, spatial_layer_id); +#else // CONFIG_VP9_HIGHBITDEPTH + (void)bit_depth; + (void)input_bit_depth; + vpx_calc_psnr(source_frame, coded_frame, &psnr, spatial_layer_id); +#endif // CONFIG_VP9_HIGHBITDEPTH + return psnr; +} + +static void update_encode_frame_result_basic( + FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index, + ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = show_idx; + encode_frame_result->update_type = update_type; + encode_frame_result->quantize_index = quantize_index; +} +#endif // !CONFIG_REALTIME_ONLY + +static void encode_frame_to_data_rate( + VP9_COMP *cpi, size_t *size, uint8_t *dest, size_t dest_size, + unsigned int *frame_flags, ENCODE_FRAME_RESULT *encode_frame_result) { VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; TX_SIZE t; + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; + set_ext_overrides(cpi); vpx_clear_system_state(); @@ -3883,8 +5289,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); #endif - // Set the arf sign bias for this frame. - set_arf_sign_bias(cpi); + if (cm->show_existing_frame == 0) { + // Update frame index + set_frame_index(cpi, cm); + + // Set the arf sign bias for this frame. + set_ref_sign_bias(cpi); + } + + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -3919,65 +5335,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->reset_frame_context = 2; } } - if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use context 0 for intra only empty frame, but the last frame context - // for other empty frames. - if (cpi->svc.encode_empty_frame_state == ENCODING) { - if (cpi->svc.encode_intra_empty_frame != 0) - cm->frame_context_idx = 0; - else - cm->frame_context_idx = FRAME_CONTEXTS - 1; - } else { - cm->frame_context_idx = - cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id; - } - cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; + if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); - // The probs will be updated based on the frame type of its previous - // frame if frame_parallel_decoding_mode is 0. The type may vary for - // the frame after a key frame in base layer since we may drop enhancement - // layers. So set frame_parallel_decoding_mode to 1 in this case. - if (cm->frame_parallel_decoding_mode == 0) { - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. - int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { - cm->frame_parallel_decoding_mode = 1; - break; - } - } - } - } + if (oxcf->aq_mode == PERCEPTUAL_AQ) { + init_mb_wiener_var_buffer(cpi); + set_mb_wiener_variance(cpi); } - // For 1 pass CBR, check if we are dropping this frame. - // For spatial layers, for now only check for frame-dropping on first spatial - // layer, and if decision is to drop, we drop whole super-frame. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME) { - if (vp9_rc_drop_frame(cpi) || - (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) { - vp9_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - cpi->ext_refresh_frame_flags_pending = 0; - cpi->svc.rc_drop_superframe = 1; - // TODO(marpan): Advancing the svc counters on dropped frames can break - // the referencing scheme for the fixed svc patterns defined in - // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but - // for now, don't advance the svc frame counters on dropped frame. - // if (cpi->use_svc) - // vp9_inc_frame_in_layer(cpi); - return; - } - } + init_sb_mul_scale_buffer(cpi); vpx_clear_system_state(); @@ -3985,28 +5351,76 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif - - if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); - } else { - encode_with_recode_loop(cpi, size, dest); + // Backup to ensure consistency between recodes + save_encode_params(cpi); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + const int ref_frame_flags = get_ref_frame_flags(cpi); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); + // index 0 of a gf group is always KEY/OVERLAY/GOLDEN. + // index 1 refers to the first encoding frame in a gf group. + // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref. + // See function define_gf_group_structure(). + const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE; + int ext_rdmult = VPX_DEFAULT_RDMULT; + get_ref_frame_bufs(cpi, ref_frame_bufs); + codec_status = vp9_extrc_get_frame_rdmult( + &cpi->ext_ratectrl, curr_frame_buf->frame_index, + cm->current_frame_coding_index, gf_group->index, update_type, + gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags, + &ext_rdmult); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_frame_rdmult() failed"); + } + cpi->ext_ratectrl.ext_rdmult = ext_rdmult; } + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + if (!encode_without_recode_loop(cpi, size, dest, dest_size)) return; + } else { +#if !CONFIG_REALTIME_ONLY +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif + encode_with_recode_loop(cpi, size, dest, dest_size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif +#endif // !CONFIG_REALTIME_ONLY + } + + // TODO(jingning): When using show existing frame mode, we assume that the + // current ARF will be directly used as the final reconstructed frame. This is + // an encoder control scheme. One could in principle explore other + // possibilities to arrange the reference frame buffer and their coding order. + if (cm->show_existing_frame) { + ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx, + cm->ref_frame_map[cpi->alt_fb_idx]); + } + +#if !CONFIG_REALTIME_ONLY // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) - vp9_try_disable_lookahead_aq(cpi, size, dest); + vp9_try_disable_lookahead_aq(cpi, size, dest, dest_size); +#endif #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED - if (oxcf->noise_sensitivity > 0) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); + if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif #endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_video_frame > 1) { - vp9_compute_skin_map(cpi, yuv_skinmap_file); + vp9_output_skin_map(cpi, yuv_skinmap_file); } #endif @@ -4035,11 +5449,98 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif + if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif // build the bitstream - vp9_pack_bitstream(cpi, dest, size); + vp9_pack_bitstream(cpi, dest, dest_size, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { + vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( + &cpi->ext_ratectrl, (*size) << 3, cm->base_qindex); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_update_encodeframe_result() failed"); + } + } +#if CONFIG_REALTIME_ONLY + (void)encode_frame_result; + assert(encode_frame_result == NULL); +#else // CONFIG_REALTIME_ONLY + if (encode_frame_result != NULL) { + const RefCntBuffer *coded_frame_buf = + get_ref_cnt_buffer(cm, cm->new_fb_idx); + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]; + FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + int quantize_index = vp9_get_quantizer(cpi); + get_ref_frame_bufs(cpi, ref_frame_bufs); + // update_encode_frame_result() depends on twopass.gf_group.index and + // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and + // cpi->alt_fb_idx are updated for current frame and have + // not been updated for the next frame yet. + // The update locations are as follows. + // 1) twopass.gf_group.index is initialized at define_gf_group by vp9_zero() + // for the first frame in the gf_group and is updated for the next frame at + // vp9_twopass_postencode_update(). + // 2) cpi->Source is updated at the beginning of vp9_get_compressed_data() + // 3) cm->new_fb_idx is updated at the beginning of + // vp9_get_compressed_data() by get_free_fb(cm). + // 4) cpi->lst_fb_idx/gld_fb_idx/alt_fb_idx will be updated for the next + // frame at vp9_update_reference_frames(). + // This function needs to be called before vp9_update_reference_frames(). + // TODO(angiebird): Improve the codebase to make the update of frame + // dependent variables more robust. + + update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index, + quantize_index, encode_frame_result); + if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) { + PSNR_STATS psnr = compute_psnr_stats( + cpi->Source, &coded_frame_buf->buf, cm->bit_depth, + cpi->oxcf.input_bit_depth, cpi->svc.spatial_layer_id); + fprintf(cpi->ext_ratectrl.log_file, + "ENCODE_FRAME_RESULT gop_index %d psnr %f bits %zu\n", + cpi->twopass.gf_group.index, psnr.psnr[0], (*size) << 3); + } + } +#endif // CONFIG_REALTIME_ONLY + + if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality && + cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) { + restore_coding_context(cpi); + return; + } + + cpi->last_frame_dropped = 0; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.num_encoded_top_layer++; + + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. + if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } if (cm->seg.update_map) update_reference_segmentation_map(cpi); @@ -4048,17 +5549,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } vp9_update_reference_frames(cpi); - for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->td.counts->coef[t], - cpi->td.rd_counts.coef_counts[t]); + if (!cm->show_existing_frame) { + for (t = TX_4X4; t <= TX_32X32; ++t) { + full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + } - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) - vp9_adapt_coef_probs(cm); - - if (!frame_is_intra_only(cm)) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + vp9_adapt_coef_probs(cm); } } @@ -4078,8 +5580,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_frame_type = cm->frame_type; - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_rc_postencode_update(cpi, *size); + vp9_rc_postencode_update(cpi, *size); + + if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 && + !frame_is_intra_only(cm) && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) { + vp9_compute_frame_low_motion(cpi); + } + + *size = VPXMAX(1, *size); #if 0 output_frame_level_debug_stats(cpi); @@ -4103,140 +5615,128 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_height = cm->height; // reset to normal state now that we are done. - if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + } if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); - // Don't increment frame counters if this was an altref buffer - // update not a real frame - ++cm->current_video_frame; if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } - cm->prev_frame = cm->cur_frame; + update_frame_indexes(cm, cm->show_frame); - if (cpi->use_svc) + if (cpi->use_svc) { cpi->svc .layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id] .last_frame_type = cm->frame_type; + // Reset layer_sync back to 0 for next frame. + cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0; + } cpi->force_update_segmentation = 0; +#if !CONFIG_REALTIME_ONLY if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi); +#endif + + cpi->svc.previous_frame_is_intra_only = cm->intra_only; + cpi->svc.set_intra_only_frame = 0; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { + size_t dest_size, unsigned int *frame_flags) { vp9_rc_get_svc_params(cpi); - encode_frame_to_data_rate(cpi, size, dest, frame_flags); + encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags, + /*encode_frame_result = */ NULL); } static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { + size_t dest_size, unsigned int *frame_flags) { if (cpi->oxcf.rc_mode == VPX_CBR) { vp9_rc_get_one_pass_cbr_params(cpi); } else { vp9_rc_get_one_pass_vbr_params(cpi); } - encode_frame_to_data_rate(cpi, size, dest, frame_flags); + encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags, + /*encode_frame_result = */ NULL); } +#if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { + size_t dest_size, unsigned int *frame_flags, + ENCODE_FRAME_RESULT *encode_frame_result) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - encode_frame_to_data_rate(cpi, size, dest, frame_flags); - - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_twopass_postencode_update(cpi); -} - -static void init_ref_frame_bufs(VP9_COMMON *cm) { - int i; - BufferPool *const pool = cm->buffer_pool; - cm->new_fb_idx = INVALID_IDX; - for (i = 0; i < REF_FRAMES; ++i) { - cm->ref_frame_map[i] = INVALID_IDX; - pool->frame_bufs[i].ref_count = 0; - } -} - -static void check_initial_width(VP9_COMP *cpi, -#if CONFIG_VP9_HIGHBITDEPTH - int use_highbitdepth, +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); #endif - int subsampling_x, int subsampling_y) { - VP9_COMMON *const cm = &cpi->common; - - if (!cpi->initial_width || -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth != use_highbitdepth || -#endif - cm->subsampling_x != subsampling_x || - cm->subsampling_y != subsampling_y) { - cm->subsampling_x = subsampling_x; - cm->subsampling_y = subsampling_y; -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth = use_highbitdepth; -#endif - - alloc_raw_frame_buffers(cpi); - init_ref_frame_bufs(cm); - alloc_util_frame_buffers(cpi); - - init_motion_estimation(cpi); // TODO(agrange) This can be removed. - - cpi->initial_width = cm->width; - cpi->initial_height = cm->height; - cpi->initial_mbs = cm->MBs; - } + encode_frame_to_data_rate(cpi, size, dest, dest_size, frame_flags, + encode_frame_result); } +#endif // !CONFIG_REALTIME_ONLY int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer timer; +#endif int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; #if CONFIG_VP9_HIGHBITDEPTH const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; -#endif - -#if CONFIG_VP9_HIGHBITDEPTH - check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); #else - check_initial_width(cpi, subsampling_x, subsampling_y); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_TEMPORAL_DENOISING - setup_denoiser_buffer(cpi); + const int use_highbitdepth = 0; #endif - vpx_usec_timer_start(&timer); - - if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif // CONFIG_VP9_HIGHBITDEPTH - frame_flags)) - res = -1; - vpx_usec_timer_mark(&timer); - cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && (subsampling_x != 1 || subsampling_y != 1)) { vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, "Non-4:2:0 color format requires profile 1 or 3"); - res = -1; + return -1; } if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && (subsampling_x == 1 && subsampling_y == 1)) { vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, "4:2:0 color format requires profile 0 or 2"); - res = -1; + return -1; } + if (cm->color_space == VPX_CS_SRGB) { + if (cm->profile == PROFILE_0 || cm->profile == PROFILE_2) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "SRGB color space requires profile 1 or 3"); + return -1; + } + if (subsampling_x != 0 || subsampling_y != 0) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "SRGB color space requires 4:4:4"); + return -1; + } + } + + update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + + alloc_raw_frame_buffers(cpi); + +#if CONFIG_INTERNAL_STATS + vpx_usec_timer_start(&timer); +#endif + + if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, + use_highbitdepth, frame_flags)) + res = -1; +#if CONFIG_INTERNAL_STATS + vpx_usec_timer_mark(&timer); + cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); +#endif return res; } @@ -4331,10 +5831,6 @@ static void check_src_altref(VP9_COMP *cpi, } #if CONFIG_INTERNAL_STATS -extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, - const uint8_t *img2, int img2_pitch, int width, - int height); - static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[Y] += y; @@ -4373,6 +5869,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); LevelConstraint *const level_constraint = &cpi->level_constraint; const int8_t level_index = level_constraint->level_index; double cpb_data_size; @@ -4476,6 +5973,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_spec->max_luma_picture_size = luma_pic_size; } + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + // update compression_ratio level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * cm->bit_depth / @@ -4496,6 +5998,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); } + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + if ((double)level_spec->max_luma_sample_rate > (double)vp9_level_defs[level_index].max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P)) { @@ -4559,48 +6070,99 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list) { + if (update_type != KF_UPDATE) { + const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG, + VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + assert(ref_frame_bufs[i] != NULL); + ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index; + ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0; + } + } else { + // No reference frame is available when this is a key frame. + int i; + for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) { + ref_frame_coding_indexes[i] = -1; + ref_frame_valid_list[i] = 0; + } + } +} + +void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) { + encode_frame_result->show_idx = -1; // Actual encoding doesn't happen. +} + +// Returns if TPL stats need to be calculated. +static INLINE int should_run_tpl(VP9_COMP *cpi, int gf_group_index) { + RATE_CONTROL *const rc = &cpi->rc; + if (!cpi->sf.enable_tpl_model) return 0; + // If there is an ARF for this GOP, TPL stats is always calculated. + if (gf_group_index == 1 && + cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE) + return 1; + // If this GOP doesn't have an ARF, TPL stats is still calculated, only when + // external rate control is used. + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL && + rc->frames_till_gf_update_due == rc->baseline_gf_interval && + cpi->twopass.gf_group.update_type[1] != ARF_UPDATE) { + return 1; + } + return 0; +} + int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush) { + size_t *size, uint8_t *dest, size_t dest_size, + int64_t *time_stamp, int64_t *time_end, int flush, + ENCODE_FRAME_RESULT *encode_frame_result) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; RATE_CONTROL *const rc = &cpi->rc; +#if CONFIG_INTERNAL_STATS struct vpx_usec_timer cmptimer; +#endif YV12_BUFFER_CONFIG *force_src_buffer = NULL; struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; + const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_two_pass_svc(cpi)) { -#if CONFIG_SPATIAL_SVC - vp9_svc_start_frame(cpi); - // Use a small empty frame instead of a real frame - if (cpi->svc.encode_empty_frame_state == ENCODING) - source = &cpi->svc.empty_frame; +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); #endif - if (oxcf->pass == 2) vp9_restore_layer_context(cpi); - } else if (is_one_pass_cbr_svc(cpi)) { - vp9_one_pass_cbr_svc_start_layer(cpi); + + if (is_one_pass_svc(cpi)) { + vp9_one_pass_svc_start_layer(cpi); } +#if CONFIG_INTERNAL_STATS vpx_usec_timer_start(&cmptimer); +#endif vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); // Is multi-arf enabled. // Note that at the moment multi_arf is only configured for 2 pass VBR and // will not work properly with svc. - if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; + // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf" + // is greater than or equal to 2. + if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2)) + cpi->multi_layer_arf = 1; else - cpi->multi_arf_allowed = 0; + cpi->multi_layer_arf = 0; // Normal defaults cm->reset_frame_context = 0; cm->refresh_frame_context = 1; - if (!is_one_pass_cbr_svc(cpi)) { + if (!is_one_pass_svc(cpi)) { cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 0; @@ -4609,9 +6171,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); - // Skip alt frame if we encode the empty frame - if (is_two_pass_svc(cpi) && source != NULL) arf_src_index = 0; - if (arf_src_index) { for (i = 0; i <= arf_src_index; ++i) { struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i); @@ -4626,26 +6185,23 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } - if (arf_src_index) { - assert(arf_src_index <= rc->frames_to_key); + // Clear arf index stack before group of pictures processing starts. + if (gf_group_index == 1) { + stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2); + cpi->twopass.gf_group.stack_size = 0; + } + if (arf_src_index) { + if (!(cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL)) { + // This assert only makes sense when not using external RC. + assert(arf_src_index <= rc->frames_to_key); + } if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; -#if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { - int i; - // Reference a hidden frame from a lower layer - for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { - if (oxcf->ss_enable_auto_arf[i]) { - cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; - break; - } - } - } - cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; -#endif - +#if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; @@ -4654,18 +6210,24 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif // Produce the filtered ARF frame. vp9_temporal_filter(cpi, arf_src_index); - vpx_extend_frame_borders(&cpi->alt_ref_buffer); + vpx_extend_frame_borders(&cpi->tf_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif // for small bitrates segmentation overhead usually // eats all bitrate gain from enabling delta quantizers if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame) vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi); - force_src_buffer = &cpi->alt_ref_buffer; + force_src_buffer = &cpi->tf_buffer; } - +#endif cm->show_frame = 0; cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; @@ -4686,7 +6248,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. - if (cpi->use_svc) + if (cpi->use_svc || cpi->svc.set_intra_only_frame) source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else source = vp9_lookahead_pop(cpi->lookahead, flush); @@ -4694,11 +6256,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (source != NULL) { cm->show_frame = 1; cm->intra_only = 0; - // if the flags indicate intra frame, but if the current picture is for - // non-zero spatial layer, it should not be an intra picture. - // TODO(Won Kap): this needs to change if per-layer intra frame is - // allowed. - if ((source->flags & VPX_EFLAG_FORCE_KF) && + // If the flags indicate intra frame, but if the current picture is for + // spatial layer above first_spatial_layer_to_encode, it should not be an + // intra picture. + if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc && cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); } @@ -4723,13 +6284,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - } else { *size = 0; - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { - vp9_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; - } return -1; } @@ -4743,10 +6299,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // adjust frame rates based on timestamps given if (cm->show_frame) { - adjust_frame_rate(cpi, source); + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.duration[cpi->svc.spatial_layer_id] > 0) + vp9_svc_adjust_frame_rate(cpi); + else + adjust_frame_rate(cpi, source); } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } @@ -4759,62 +6319,150 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) return -1; - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - - if (!cpi->use_svc && cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } + // If the frame buffer for current frame is the same as previous frame, MV in + // the base layer shouldn't be used as it'll cause data race. + if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) { + cpi->svc.use_base_mv = 0; } - // Start with a 0 size frame. *size = 0; cpi->frame_flags = *frame_flags; - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { +#if !CONFIG_REALTIME_ONLY + if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif } else if (oxcf->pass == 1) { set_frame_size(cpi); } + // Key frame temporal filtering + const int is_key_temporal_filter_enabled = + oxcf->enable_keyframe_filtering && cpi->oxcf.mode != REALTIME && + (oxcf->pass != 1) && !cpi->use_svc && + !is_lossless_requested(&cpi->oxcf) && cm->frame_type == KEY_FRAME && + (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0) && + cpi->oxcf.speed < 2; + // Save the pointer to the original source image. + YV12_BUFFER_CONFIG *source_buffer = cpi->un_scaled_source; + + if (is_key_temporal_filter_enabled && source != NULL) { + // Produce the filtered Key frame. Set distance to -1 since the key frame + // is already popped out. + vp9_temporal_filter(cpi, -1); + vpx_extend_frame_borders(&cpi->tf_buffer); + force_src_buffer = &cpi->tf_buffer; + cpi->un_scaled_source = cpi->Source = + force_src_buffer ? force_src_buffer : &source->img; + } +#endif // !CONFIG_REALTIME_ONLY + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && cpi->level_constraint.fail_flag == 0) level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { - for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } - if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + if (cpi->kmeans_data_arr_alloc == 0) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&cpi->kmeans_mutex, NULL); +#endif + CHECK_MEM_ERROR( + &cm->error, cpi->kmeans_data_arr, + vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); + cpi->kmeans_data_stride = mi_cols; + cpi->kmeans_data_arr_alloc = 1; + } + +#if CONFIG_NON_GREEDY_MV + { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); + Status status = vp9_alloc_motion_field_info( + &cpi->motion_field_info, MAX_ARF_GOP_SIZE, mi_rows, mi_cols); + if (status == STATUS_FAILED) { + vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, + "vp9_alloc_motion_field_info failed"); + } + } +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif + if (should_run_tpl(cpi, cpi->twopass.gf_group.index)) { + vp9_init_tpl_buffer(cpi); + vp9_estimate_tpl_qp_gop(cpi); + vp9_setup_tpl_stats(cpi); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); +#endif +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); +#endif + + cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + (void)encode_frame_result; + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, dest_size, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, dest_size, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY + if (oxcf->pass == 1 && !cpi->use_svc) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) - cpi->td.mb.fwd_txm4x4 = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; - cpi->td.mb.highbd_itxm_add = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); - } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { - Pass2Encode(cpi, size, dest, frame_flags); + } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif + Pass2Encode(cpi, size, dest, dest_size, frame_flags, encode_frame_result); + vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif } else if (cpi->use_svc) { - SvcEncode(cpi, size, dest, frame_flags); + SvcEncode(cpi, size, dest, dest_size, frame_flags); } else { // One pass encode - Pass0Encode(cpi, size, dest, frame_flags); + Pass0Encode(cpi, size, dest, dest_size, frame_flags); } +#endif // CONFIG_REALTIME_ONLY + + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -4829,26 +6477,35 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Save layer specific state. - if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || - cpi->svc.number_spatial_layers > 1) && - oxcf->pass == 2)) { + if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { vp9_save_layer_context(cpi); } + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->fixed_qp_onepass = 0; + +#if CONFIG_INTERNAL_STATS vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); - - // Should we calculate metrics for the frame. - if (is_psnr_calc_enabled(cpi)) generate_psnr_packet(cpi); +#endif if (cpi->keep_level_stats && oxcf->pass != 1) update_level_info(cpi, size, arf_src_index); +#if !CONFIG_REALTIME_ONLY + if (is_key_temporal_filter_enabled && cpi->b_calculate_psnr) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, source_buffer, &cpi->scaled_source, (oxcf->pass == 0), EIGHTTAP, 0); + } +#endif // !CONFIG_REALTIME_ONLY + #if CONFIG_INTERNAL_STATS - if (oxcf->pass != 1) { + if (oxcf->pass != 1 && !cpi->last_frame_dropped) { double samples = 0.0; - cpi->bytes += (int)(*size); + cpi->bytes += *size; if (cm->show_frame) { uint32_t bit_depth = 8; @@ -4868,9 +6525,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, PSNR_STATS psnr; #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd, - in_bit_depth); + in_bit_depth, cpi->svc.spatial_layer_id); #else - vpx_calc_psnr(orig, recon, &psnr); + vpx_calc_psnr(orig, recon, &psnr, cpi->svc.spatial_layer_id); #endif // CONFIG_VP9_HIGHBITDEPTH adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], @@ -4898,16 +6555,18 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, ppflags.post_proc_flag = VP9D_DEBLOCK; ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame() ppflags.noise_level = 0; // not used in vp9_post_proc_frame() - vp9_post_proc_frame(cm, pp, &ppflags); + vp9_post_proc_frame(cm, pp, &ppflags, + cpi->un_scaled_source->y_width); } #endif vpx_clear_system_state(); #if CONFIG_VP9_HIGHBITDEPTH vpx_calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd, - cpi->oxcf.input_bit_depth); + cpi->oxcf.input_bit_depth, + cpi->svc.spatial_layer_id); #else - vpx_calc_psnr(orig, pp, &psnr2); + vpx_calc_psnr(orig, pp, &psnr2, cpi->svc.spatial_layer_id); #endif // CONFIG_VP9_HIGHBITDEPTH cpi->totalp_sq_error += psnr2.sse[0]; @@ -4944,11 +6603,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; #if 0 - { + if (cm->show_frame) { FILE *f = fopen("q_used.stt", "a"); fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); fclose(f); } #endif @@ -5007,21 +6666,42 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) { - cpi->svc.encode_empty_frame_state = ENCODED; - cpi->svc.encode_intra_empty_frame = 0; - } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); - if (cm->show_frame) { - ++cpi->svc.spatial_layer_to_encode; - if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) - cpi->svc.spatial_layer_to_encode = 0; + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; - // May need the empty frame after an visible frame. - cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. + if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; } - } else if (is_one_pass_cbr_svc(cpi)) { + } +#endif + + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) @@ -5045,7 +6725,7 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(cm, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width); #else if (cm->frame_to_show) { *dest = *cm->frame_to_show; @@ -5063,12 +6743,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } } -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { VP9_COMMON *cm = &cpi->common; int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); @@ -5090,20 +6770,21 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height) { VP9_COMMON *cm = &cpi->common; #if CONFIG_VP9_HIGHBITDEPTH - check_initial_width(cpi, cm->use_highbitdepth, 1, 1); + update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x, + cpi->common.subsampling_y); #else - check_initial_width(cpi, 1, 1); + update_initial_width(cpi, 0, cpi->common.subsampling_x, + cpi->common.subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_VP9_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif - + alloc_raw_frame_buffers(cpi); if (width) { cm->width = width; if (cm->width > cpi->initial_width) { cm->width = cpi->initial_width; - printf("Warning: Desired width too large, changed to %d\n", cm->width); } } @@ -5111,7 +6792,6 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, cm->height = height; if (cm->height > cpi->initial_height) { cm->height = cpi->initial_height; - printf("Warning: Desired height too large, changed to %d\n", cm->height); } } assert(cm->width <= cpi->initial_width); @@ -5127,7 +6807,7 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) { return; } -int vp9_get_quantizer(VP9_COMP *cpi) { return cpi->common.base_qindex; } +int vp9_get_quantizer(const VP9_COMP *cpi) { return cpi->common.base_qindex; } void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { if (flags & @@ -5161,3 +6841,28 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { vp9_update_entropy(cpi, 0); } } + +void vp9_set_row_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->row_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.row_mt && !cpi->use_svc) + cpi->row_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt && + !cpi->use_svc) + cpi->row_mt = 1; + + // In realtime mode, enable row based multi-threading for all the speed levels + // where non-rd path is used. + if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) { + cpi->row_mt = 1; + } + + if (cpi->row_mt) + cpi->row_mt_bit_exact = 1; + else + cpi->row_mt_bit_exact = 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h index de324d3aab..f58fa2470e 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h @@ -8,20 +8,26 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODER_H_ -#define VP9_ENCODER_VP9_ENCODER_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_ +#define VPX_VP9_ENCODER_VP9_ENCODER_H_ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif #include "vpx_dsp/variance.h" +#include "vpx_dsp/psnr.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_pthread.h" #include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_timestamp.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_ppflags.h" @@ -29,11 +35,16 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_job_queue.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" @@ -82,13 +93,6 @@ typedef enum { ENCODE_BREAKOUT_LIMITED = 2 } ENCODE_BREAKOUT_TYPE; -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. @@ -117,9 +121,11 @@ typedef enum { COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, EQUATOR360_AQ = 4, + PERCEPTUAL_AQ = 5, + PSNR_AQ = 6, // AQ based on lookahead temporal // variance (only valid for altref frames) - LOOKAHEAD_AQ = 5, + LOOKAHEAD_AQ = 7, AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; @@ -129,6 +135,22 @@ typedef enum { RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec. } RESIZE_TYPE; +typedef enum { + kInvalid = 0, + kLowSadLowSumdiff = 1, + kLowSadHighSumdiff = 2, + kHighSadLowSumdiff = 3, + kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, + kVeryHighSad = 6, +} CONTENT_STATE_SB; + +typedef enum { + LOOPFILTER_ALL = 0, + LOOPFILTER_REFERENCE = 1, // Disable loopfilter on non reference frames. + NO_LOOPFILTER = 2, // Disable loopfilter on all frames. +} LOOPFILTER_CONTROL; + typedef struct VP9EncoderConfig { BITSTREAM_PROFILE profile; vpx_bit_depth_t bit_depth; // Codec bit-depth. @@ -136,7 +158,10 @@ typedef struct VP9EncoderConfig { int height; // height of data passed to the compressor unsigned int input_bit_depth; // Input bit depth. double init_framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in bits per second + vpx_rational_t g_timebase; // equivalent to g_timebase in vpx_codec_enc_cfg_t + vpx_rational64_t g_timebase_in_ts; // g_timebase * TICKS_PER_SEC + + int64_t target_bandwidth; // bandwidth to be used in bits per second int noise_sensitivity; // pre processing blur: recommendation 0 int sharpness; // sharpening output: recommendation 0: @@ -197,6 +222,7 @@ typedef struct VP9EncoderConfig { int two_pass_vbrbias; // two pass datarate control tweaks int two_pass_vbrmin_section; int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- @@ -235,16 +261,15 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; + int enable_tpl_model; + + int enable_keyframe_filtering; + int max_threads; unsigned int target_level; vpx_fixed_buf_t two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; - -#if CONFIG_FP_MB_STATS - vpx_fixed_buf_t firstpass_mb_stats_in; -#endif vp8e_tuning tuning; vp9e_tune_content content; @@ -256,21 +281,102 @@ typedef struct VP9EncoderConfig { int render_width; int render_height; VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + + int row_mt; + unsigned int motion_vector_unit_test; + int delta_q_uv; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct TplDepStats { + int64_t intra_cost; + int64_t inter_cost; + int64_t mc_flow; + int64_t mc_dep_cost; + int64_t mc_ref_cost; + + int ref_frame_index; + int_mv mv; +} TplDepStats; + +#if CONFIG_NON_GREEDY_MV + +#define ZERO_MV_MODE 0 +#define NEW_MV_MODE 1 +#define NEAREST_MV_MODE 2 +#define NEAR_MV_MODE 3 +#define MAX_MV_MODE 4 +#endif + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_qindex; +#if CONFIG_NON_GREEDY_MV + int lambda; + int *mv_mode_arr[3]; + double *rd_diff_arr[3]; +#endif +} TplDepFrame; + +#define TPL_DEP_COST_SCALE_LOG2 4 + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int mode_map[BLOCK_SIZES][MAX_MODES]; - int m_search_count; - int ex_search_count; + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; + int8_t mode_map[BLOCK_SIZES][MAX_MODES]; + FIRSTPASS_DATA fp_data; + VP9RowMTSync row_mt_sync; + + // Used for adaptive_rd_thresh with row multithreading + int *row_base_thresh_freq_fact; + // The value of sb_rows when row_base_thresh_freq_fact is allocated. + // The row_base_thresh_freq_fact array has sb_rows * BLOCK_SIZES * MAX_MODES + // elements. + int sb_rows; + MV firstpass_top_mv; } TileDataEnc; +typedef struct RowMTInfo { + JobQueueHandle job_queue_hdl; +#if CONFIG_MULTITHREAD + pthread_mutex_t job_mutex; +#endif +} RowMTInfo; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_vert_unit_rows; + + // Frame level params + int num_tile_vert_sbs[MAX_NUM_TILE_ROWS]; + + // Job Queue structure and handles + JobQueue *job_queue; + + int jobs_per_tile_col; + + RowMTInfo row_mt_info[MAX_NUM_TILE_COLS]; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + typedef struct RD_COUNTS { vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; @@ -312,6 +418,7 @@ typedef struct IMAGE_STAT { typedef enum { LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, LEVEL_1 = 10, LEVEL_1_1 = 11, LEVEL_2 = 20, @@ -333,6 +440,7 @@ typedef struct { VP9_LEVEL level; uint64_t max_luma_sample_rate; uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; double average_bitrate; // in kilobits per second double max_cpb_size; // in kilobits double compression_ratio; @@ -372,25 +480,118 @@ typedef struct { typedef enum { BITRATE_TOO_LARGE = 0, - LUMA_PIC_SIZE_TOO_LARGE = 1, - LUMA_SAMPLE_RATE_TOO_LARGE = 2, - CPB_TOO_LARGE = 3, - COMPRESSION_RATIO_TOO_SMALL = 4, - TOO_MANY_COLUMN_TILE = 5, - ALTREF_DIST_TOO_SMALL = 6, - TOO_MANY_REF_BUFFER = 7, - TARGET_LEVEL_FAIL_IDS = 8 + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS } TARGET_LEVEL_FAIL_ID; typedef struct { int8_t level_index; - uint8_t rc_config_updated; uint8_t fail_flag; int max_frame_size; // in bits double max_cpb_size; // in bits } LevelConstraint; +typedef struct ARNRFilterData { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int strength; + int frame_count; + int alt_ref_index; + struct scale_factors sf; + YV12_BUFFER_CONFIG *dst; +} ARNRFilterData; + +typedef struct EncFrameBuf { + int mem_valid; + int released; + YV12_BUFFER_CONFIG frame; +} EncFrameBuf; + +// Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. +#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) +#define MAX_KMEANS_GROUPS 8 + +typedef struct KMEANS_DATA { + double value; + int pos; + int group_idx; +} KMEANS_DATA; + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. +typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { + FRAME_INFO frame_info; QUANTS quants; ThreadData td; MB_MODE_INFO_EXT *mbmi_ext_base; @@ -413,17 +614,39 @@ typedef struct VP9_COMP { #endif YV12_BUFFER_CONFIG *raw_source_frame; + BLOCK_SIZE tpl_bsize; + TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + VpxTplGopStats tpl_gop_stats; + YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; + EncFrameBuf enc_frame_buf[REF_FRAMES]; +#if CONFIG_MULTITHREAD + pthread_mutex_t kmeans_mutex; +#endif + int kmeans_data_arr_alloc; + KMEANS_DATA *kmeans_data_arr; + int kmeans_data_size; + int kmeans_data_stride; + double kmeans_ctr_ls[MAX_KMEANS_GROUPS]; + double kmeans_boundary_ls[MAX_KMEANS_GROUPS]; + int kmeans_count_ls[MAX_KMEANS_GROUPS]; + int kmeans_ctr_num; +#if CONFIG_NON_GREEDY_MV + MotionFieldInfo motion_field_info; + int tpl_ready; + int_mv *select_mv_arr; +#endif + TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. - // For a still frame, this flag is set to 1 to skip partition search. - int partition_search_skippable_frame; - - int scaled_ref_idx[MAX_REF_FRAMES]; + int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + int ref_fb_idx[REF_FRAMES]; + int refresh_last_frame; int refresh_golden_frame; int refresh_alt_ref_frame; @@ -436,14 +659,23 @@ typedef struct VP9_COMP { int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int64_t norm_wiener_variance; + int64_t *mb_wiener_variance; + int mb_wiener_var_rows; + int mb_wiener_var_cols; + double *mi_ssim_rdmult_scaling_factors; + + int64_t *sb_mul_scale; + YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tile_tok[4][1 << 6]; - uint32_t tok_count[4][1 << 6]; + TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames int64_t ambient_err; + RD_CONTROL rd_ctrl; RD_OPT rd; CODING_CONTEXT coding_context; @@ -460,7 +692,7 @@ typedef struct VP9_COMP { RATE_CONTROL rc; double framerate; - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; + int interp_filter_selected[REF_FRAMES][SWITCHABLE]; struct vpx_codec_pkt_list *output_pkt_list; @@ -485,23 +717,23 @@ typedef struct VP9_COMP { uint8_t *segmentation_map; - // segment threashold for encode breakout + uint8_t *skin_map; + + // segment threshold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; CYCLIC_REFRESH *cyclic_refresh; ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - vp9_full_search_fn_t full_search_sad; + struct scale_factors me_sf; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; +#if CONFIG_INTERNAL_STATS uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; uint64_t time_encode_sb_row; - -#if CONFIG_FP_MB_STATS - int use_fp_mb_stats; #endif TWO_PASS twopass; @@ -509,7 +741,7 @@ typedef struct VP9_COMP { // Force recalculation of segment_ids for each mode info uint8_t force_update_segmentation; - YV12_BUFFER_CONFIG alt_ref_buffer; + YV12_BUFFER_CONFIG tf_buffer; // class responsible for adaptive // quantization of altref frames @@ -530,7 +762,7 @@ typedef struct VP9_COMP { double total_blockiness; double worst_blockiness; - int bytes; + uint64_t bytes; double summed_quality; double summed_weights; double summedp_quality; @@ -563,16 +795,13 @@ typedef struct VP9_COMP { // number of MBs in the current frame when the frame is // scaled. + int last_coded_width; + int last_coded_height; + int use_svc; SVC svc; - // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - diff *source_diff_var; - // The threshold used in SOURCE_VAR_BASED_PARTITION search type. - unsigned int source_var_thresh; - int frames_till_next_var_check; - int frame_flags; search_site_config ss_cfg; @@ -583,17 +812,15 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; #endif int resize_pending; - int resize_state; + RESIZE_STATE resize_state; int external_resize; int resize_scale_num; int resize_scale_den; @@ -629,17 +856,84 @@ typedef struct VP9_COMP { int keep_level_stats; Vp9LevelInfo level_info; + MultiThreadHandle multi_thread_ctxt; + void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); + void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + ARNRFilterData arnr_filter_data; + + int row_mt; + unsigned int row_mt_bit_exact; // Previous Partition Info BLOCK_SIZE *prev_partition; int8_t *prev_segment_id; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for + // 32x32, 9~24 for 16x16. + // This is for the last frame and is copied to the current frame + // when partition copy happens. + uint8_t *prev_variance_low; + uint8_t *copied_frame_cnt; + uint8_t max_copied_frame; + // If the last frame is dropped, we don't copy partition. + uint8_t last_frame_dropped; + + // For each superblock: keeps track of the last time (in frame distance) the + // the superblock did not have low source sad. + uint8_t *content_state_sb_fd; + + int compute_source_sad_onepass; + + int compute_frame_low_motion_onepass; LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t *count_lastgolden_frame_usage; + + int multi_layer_arf; + vpx_roi_map_t roi; + + LOOPFILTER_CONTROL loopfilter_ctrl; + EXT_RATECTRL ext_ratectrl; + + int fixed_qp_onepass; + + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. + */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. + */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; +typedef struct ENCODE_FRAME_RESULT { + int show_idx; + FRAME_UPDATE_TYPE update_type; + int quantize_index; +} ENCODE_FRAME_RESULT; + +void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result); + void vp9_initialize_enc(void); -struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, +struct VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool); void vp9_remove_compressor(VP9_COMP *cpi); @@ -649,11 +943,12 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // frame is made and not just a copy of the pointer.. int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); + int64_t end_time); int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, - size_t *size, uint8_t *dest, int64_t *time_stamp, - int64_t *time_end, int flush); + size_t *size, uint8_t *dest, size_t dest_size, + int64_t *time_stamp, int64_t *time_end, int flush, + ENCODE_FRAME_RESULT *encode_frame_result); int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); @@ -670,25 +965,63 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); void vp9_set_svc(VP9_COMP *cpi, int use_svc); -int vp9_get_quantizer(struct VP9_COMP *cpi); +// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the +// configuration change has a large change in avg_frame_bandwidth. +// For SVC check for resetting based on spatial layer average bandwidth. +// Also reset buffer level to optimal level. +void vp9_check_reset_rc_flag(VP9_COMP *cpi); + +void vp9_set_rc_buffer_sizes(VP9_COMP *cpi); + +static INLINE int stack_pop(int *stack, int stack_size) { + int idx; + const int r = stack[0]; + for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx]; + + return r; +} + +static INLINE int stack_top(const int *stack) { return stack[0]; } + +static INLINE void stack_push(int *stack, int new_item, int stack_size) { + int idx; + for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1]; + stack[0] = new_item; +} + +static INLINE void stack_init(int *stack, int length) { + int idx; + for (idx = 0; idx < length; ++idx) stack[idx] = -1; +} + +int vp9_get_quantizer(const VP9_COMP *cpi); static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } +static INLINE int ref_frame_to_flag(int8_t ref_frame) { + static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + return kVp9RefFlagList[ref_frame]; +} + static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { if (ref_frame == LAST_FRAME) { @@ -707,9 +1040,25 @@ static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } +static INLINE RefCntBuffer *get_ref_cnt_buffer(const VP9_COMMON *cm, + int fb_idx) { + return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL; +} + +static INLINE void get_ref_frame_bufs( + const VP9_COMP *cpi, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]) { + const VP9_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + int ref_frame_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + int inter_ref_idx = mv_ref_frame_to_inter_ref_idx(ref_frame); + ref_frame_bufs[inter_ref_idx] = get_ref_cnt_buffer(cm, ref_frame_buf_idx); + } +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( - VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) { + const VP9_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL; @@ -721,7 +1070,13 @@ static INLINE int get_token_alloc(int mb_rows, int mb_cols) { // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full // resolution. We assume up to 1 token per pixel, and then allow // a head room of 4. - return mb_rows * mb_cols * (16 * 16 * 3 + 4); + + // Use aligned mb_rows and mb_cols to better align with actual token sizes. + const int aligned_mb_rows = + ALIGN_POWER_OF_TWO(mb_rows, MI_BLOCK_SIZE_LOG2 - 1); + const int aligned_mb_cols = + ALIGN_POWER_OF_TWO(mb_cols, MI_BLOCK_SIZE_LOG2 - 1); + return aligned_mb_rows * aligned_mb_cols * (16 * 16 * 3 + 4); } // Get the allocated token size for a tile. It does the same calculation as in @@ -733,6 +1088,20 @@ static INLINE int allocated_tokens(TileInfo tile) { return get_token_alloc(tile_mb_rows, tile_mb_cols); } +static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col, + int mi_row, TOKENEXTRA **tok) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1; + const int mb_row = (mi_row - tile_info->mi_row_start) >> 1; + + *tok = + cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols); +} + int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, @@ -743,37 +1112,46 @@ void vp9_scale_references(VP9_COMP *cpi); void vp9_update_reference_frames(VP9_COMP *cpi); +void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], + int *ref_frame_coding_indexes, + int *ref_frame_valid_list); + void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); -YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - YV12_BUFFER_CONFIG *scaled_temp); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH -YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, - YV12_BUFFER_CONFIG *unscaled, - YV12_BUFFER_CONFIG *scaled, - int use_normative_scaler); +YV12_BUFFER_CONFIG *vp9_scale_if_required( + VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, + int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler); void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { - return cpi->use_svc && cpi->oxcf.pass != 0; -} - -static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { +static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } +#if CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); +} +#endif + +#define MIN_LOOKAHEAD_FOR_ARFS 4 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && - cpi->oxcf.lag_in_frames > 0 && - (cpi->oxcf.enable_auto_arf && - (!is_two_pass_svc(cpi) || - cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && + cpi->oxcf.enable_auto_arf; } -static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, +static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_refs[0] = @@ -790,6 +1168,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; } +static INLINE int get_num_vert_units(TileInfo tile, int shift) { + int num_vert_units = + (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift; + return num_vert_units; +} + +static INLINE int get_num_cols(TileInfo tile, int shift) { + int num_cols = + (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift; + return num_cols; +} + static INLINE int get_level_index(VP9_LEVEL level) { int i; for (i = 0; i < VP9_LEVELS; ++i) { @@ -798,14 +1188,204 @@ static INLINE int get_level_index(VP9_LEVEL level) { return -1; } +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = 0; i < VP9_LEVELS; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); +void vp9_set_row_mt(VP9_COMP *cpi); + +int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + +static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, + int subsampling_dim, int blk_dim) { + return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +} + +// Compute the sum of squares on all visible 4x4s in the transform block. +static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + + switch (args->enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) + : 1; + } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODER_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c index f4f7c7bacc..efe47259ef 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c @@ -8,10 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp9/common/vp9_thread_common.h" +#include "vp9/encoder/vp9_bitstream.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/vpx_pthread.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { int i, j, k, l, m, n; @@ -32,7 +38,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td_t->rd_counts.coef_counts[i][j][k][l][m][n]; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -49,7 +56,7 @@ static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col); } - return 0; + return 1; } static int get_max_tile_cols(VP9_COMP *cpi) { @@ -61,81 +68,145 @@ static int get_max_tile_cols(VP9_COMP *cpi) { vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } return (1 << log2_tile_cols); } +static void create_enc_workers(VP9_COMP *cpi, int num_workers) { + VP9_COMMON *const cm = &cpi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; + // While using SVC, we need to allocate threads according to the highest + // resolution. When row based multithreading is enabled, it is OK to + // allocate more threads than the number of max tile columns. + if (cpi->use_svc && !cpi->row_mt) { + int max_tile_cols = get_max_tile_cols(cpi); + num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); + } + assert(num_workers > 0); + if (num_workers == cpi->num_workers) return; + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + vp9_encode_free_mt_data(cpi); + + CHECK_MEM_ERROR(&cm->error, cpi->workers, + vpx_malloc(num_workers * sizeof(*cpi->workers))); + + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; + + ++cpi->num_workers; + winterface->init(worker); + worker->thread_name = "vpx enc worker"; + + if (i < num_workers - 1) { + thread_data->cpi = cpi; + + // Allocate thread data. + CHECK_MEM_ERROR(&cm->error, thread_data->td, + vpx_memalign(32, sizeof(*thread_data->td))); + vp9_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + vp9_setup_pc_tree(cm, thread_data->td); + + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, + vpx_calloc(1, sizeof(*thread_data->td->counts))); + + // Create threads + if (!winterface->reset(worker)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->cpi = cpi; + thread_data->td = &cpi->td; + } + winterface->sync(worker); + } +} + +static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, + int num_workers) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + worker->hook = hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = data2; + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} + +void vp9_encode_free_mt_data(struct VP9_COMP *cpi) { + int t; + for (t = 0; t < cpi->num_workers; ++t) { + VPxWorker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; + + // Deallocate allocated threads. + vpx_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { + vpx_free(thread_data->td->counts); + vp9_free_pc_tree(thread_data->td); + vpx_free(thread_data->td); + } + } + vpx_free(cpi->tile_thr_data); + cpi->tile_thr_data = NULL; + vpx_free(cpi->workers); + cpi->workers = NULL; + cpi->num_workers = 0; +} + void vp9_encode_tiles_mt(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); int i; vp9_init_tile_data(cpi); - // Only run once to create threads and allocate thread data. - if (cpi->num_workers == 0) { - int allocated_workers = num_workers; - - // While using SVC, we need to allocate threads according to the highest - // resolution. - if (cpi->use_svc) { - int max_tile_cols = get_max_tile_cols(cpi); - allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols); - } - - CHECK_MEM_ERROR(cm, cpi->workers, - vpx_malloc(allocated_workers * sizeof(*cpi->workers))); - - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, - vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data))); - - for (i = 0; i < allocated_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data = &cpi->tile_thr_data[i]; - - ++cpi->num_workers; - winterface->init(worker); - - if (i < allocated_workers - 1) { - thread_data->cpi = cpi; - - // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, - vpx_memalign(32, sizeof(*thread_data->td))); - vp9_zero(*thread_data->td); - - // Set up pc_tree. - thread_data->td->leaf_tree = NULL; - thread_data->td->pc_tree = NULL; - vp9_setup_pc_tree(cm, thread_data->td); - - // Allocate frame counters in thread data. - CHECK_MEM_ERROR(cm, thread_data->td->counts, - vpx_calloc(1, sizeof(*thread_data->td->counts))); - - // Create threads - if (!winterface->reset(worker)) - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Tile encoder thread creation failed"); - } else { - // Main thread acts as a worker and uses the thread data in cpi. - thread_data->cpi = cpi; - thread_data->td = &cpi->td; - } - - winterface->sync(worker); - } - } + create_enc_workers(cpi, num_workers); for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; - - worker->hook = (VPxWorkerHook)enc_worker_hook; - worker->data1 = &cpi->tile_thr_data[i]; - worker->data2 = NULL; - thread_data = (EncWorkerData *)worker->data1; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. if (thread_data->td != &cpi->td) { @@ -165,25 +236,449 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - // Encode a frame - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - - // Set the starting tile for each thread. - thread_data->start = i; - - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); - } - - // Encoding ends. - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); - } + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} + +#if !CONFIG_REALTIME_ONLY +static void accumulate_fp_tile_stat(TileDataEnc *tile_data, + TileDataEnc *tile_data_t) { + tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor; + tile_data->fp_data.brightness_factor += + tile_data_t->fp_data.brightness_factor; + tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error; + tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error; + tile_data->fp_data.frame_noise_energy += + tile_data_t->fp_data.frame_noise_energy; + tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error; + tile_data->fp_data.intercount += tile_data_t->fp_data.intercount; + tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count; + tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count; + tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low; + tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; + tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; + tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; + tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; + tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; + tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; + tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs; + tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs; + tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs; + tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors; + tile_data->fp_data.intra_smooth_count += + tile_data_t->fp_data.intra_smooth_count; + const int min_start_row = VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row); + tile_data->fp_data.image_data_start_row = + (min_start_row == INVALID_ROW) + ? VPXMAX(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) + : min_start_row; +} +#endif // !CONFIG_REALTIME_ONLY + +// Allocate memory for row synchronization +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, + vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); + if (row_mt_sync->mutex) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, + vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); + if (row_mt_sync->cond) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, + vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. + row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex[i]); + } + vpx_free(row_mt_sync->mutex); + } + if (row_mt_sync->cond != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond[i]); + } + vpx_free(row_mt_sync->cond); + } +#endif // CONFIG_MULTITHREAD + vpx_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + vp9_zero(*row_mt_sync); + } +} + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { + pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync != nsync - 1) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond[r]); + pthread_mutex_unlock(&row_mt_sync->mutex[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +#if !CONFIG_REALTIME_ONLY +static int first_pass_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + FIRSTPASS_DATA fp_acc_data; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_row = proc_job->vert_unit_row_num; + + best_ref_mv = zero_mv; + vp9_zero(fp_acc_data); + fp_acc_data.image_data_start_row = INVALID_ROW; + vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data, + this_tile, &best_ref_mv, mb_row); + } + } + return 1; +} + +void vp9_encode_fp_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + TileDataEnc *first_tile_col; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, FIRST_PASS_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, + num_workers); + + first_tile_col = &cpi->tile_data[0]; + for (i = 1; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + accumulate_fp_tile_stat(first_tile_col, this_tile); + } +} + +static int temporal_filter_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int mb_col_start, mb_col_end; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT; + mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT; + mb_row = proc_job->vert_unit_row_num; + + vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, + mb_col_start, mb_col_end); + } + } + return 1; +} + +void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = cpi->num_workers ? cpi->num_workers : 1; + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ARNR_JOB); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + } + } + + launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, + num_workers); +} +#endif // !CONFIG_REALTIME_ONLY + +static int enc_row_mt_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 1; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + + // Handle use_nonrd_pick_mode case. + if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, + num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h index 1efa4dcde2..46478bef9f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h @@ -8,13 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ETHREAD_H_ -#define VP9_ENCODER_VP9_ETHREAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#define VPX_VP9_ENCODER_VP9_ETHREAD_H_ + +#include "vpx_util/vpx_pthread.h" #ifdef __cplusplus extern "C" { #endif +#define MAX_NUM_TILE_COLS (1 << 6) +#define MAX_NUM_TILE_ROWS 4 +#define MAX_NUM_THREADS 64 + struct VP9_COMP; struct ThreadData; @@ -22,12 +28,52 @@ typedef struct EncWorkerData { struct VP9_COMP *cpi; struct ThreadData *td; int start; + int thread_id; + int tile_completion_status[MAX_NUM_TILE_COLS]; } EncWorkerData; +// Encoder row synchronization +typedef struct VP9RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; + pthread_cond_t *cond; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} VP9RowMTSync; + +// Frees EncWorkerData related allocations made by vp9_encode_*_mt(). +// row_mt specific data is freed with vp9_row_mt_mem_dealloc() and is not +// called by this function. +void vp9_encode_free_mt_data(struct VP9_COMP *cpi); + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + +void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +// Allocate memory for row based multi-threading synchronization. +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm, + int rows); + +// Deallocate row based multi-threading synchronization related mutex and data. +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync); + +void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ETHREAD_H_ +#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c new file mode 100644 index 0000000000..e8cc006b1e --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" + +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + vp9_zero(*ext_ratectrl); + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl) { + vpx_rc_status_t rc_status; + vpx_rc_firstpass_stats_t *rc_firstpass_stats; + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + vp9_extrc_delete(ext_ratectrl); + ext_ratectrl->funcs = funcs; + ext_ratectrl->ratectrl_config = ratectrl_config; + rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv, + &ext_ratectrl->ratectrl_config, + &ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats; + rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count; + rc_firstpass_stats->frame_stats = + vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) * + rc_firstpass_stats->num_frames); + if (rc_firstpass_stats->frame_stats == NULL) { + return VPX_CODEC_MEM_ERROR; + } + if (funcs.rate_ctrl_log_path != NULL) { + ext_ratectrl->log_file = fopen(funcs.rate_ctrl_log_path, "w"); + if (!ext_ratectrl->log_file) { + return VPX_CODEC_ERROR; + } + } else { + ext_ratectrl->log_file = NULL; + } + ext_ratectrl->ready = 1; + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + if (ext_ratectrl->log_file) { + fclose(ext_ratectrl->log_file); + } + vpx_rc_status_t rc_status = + ext_ratectrl->funcs.delete_model(ext_ratectrl->model); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats); + } + return vp9_extrc_init(ext_ratectrl); +} + +static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, + vpx_rc_frame_stats_t *rc_frame_stats) { + rc_frame_stats->frame = stats->frame; + rc_frame_stats->weight = stats->weight; + rc_frame_stats->intra_error = stats->intra_error; + rc_frame_stats->coded_error = stats->coded_error; + rc_frame_stats->sr_coded_error = stats->sr_coded_error; + rc_frame_stats->frame_noise_energy = stats->frame_noise_energy; + rc_frame_stats->pcnt_inter = stats->pcnt_inter; + rc_frame_stats->pcnt_motion = stats->pcnt_motion; + rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref; + rc_frame_stats->pcnt_neutral = stats->pcnt_neutral; + rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low; + rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high; + rc_frame_stats->intra_skip_pct = stats->intra_skip_pct; + rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct; + rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows; + rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols; + rc_frame_stats->MVr = stats->MVr; + rc_frame_stats->mvr_abs = stats->mvr_abs; + rc_frame_stats->MVc = stats->MVc; + rc_frame_stats->mvc_abs = stats->mvc_abs; + rc_frame_stats->MVrv = stats->MVrv; + rc_frame_stats->MVcv = stats->MVcv; + rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; + rc_frame_stats->duration = stats->duration; + rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; +} + +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; + vpx_rc_firstpass_stats_t *rc_firstpass_stats = + &ext_ratectrl->rc_firstpass_stats; + int i; + assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames); + for (i = 0; i < rc_firstpass_stats->num_frames; ++i) { + gen_rc_firstpass_stats(&first_pass_info->stats[i], + &rc_firstpass_stats->frame_stats[i]); + } + rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model, + rc_firstpass_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { + // TODO(angiebird): Add unit test to make sure this function behaves like + // get_frame_type_from_update_type() + // TODO(angiebird): Merge this function with get_frame_type_from_update_type() + switch (update_type) { + case KF_UPDATE: return 0; // kFrameTypeKey; + case ARF_UPDATE: return 2; // kFrameTypeAltRef; + case GF_UPDATE: return 4; // kFrameTypeGolden; + case OVERLAY_UPDATE: return 3; // kFrameTypeOverlay; + case LF_UPDATE: return 1; // kFrameTypeInter; + default: + fprintf(stderr, "Unsupported update_type %d\n", update_type); + abort(); + } +} + +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, int gop_index, + vpx_rc_encodeframe_decision_t *encode_frame_decision) { + assert(ext_ratectrl != NULL); + assert(ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0); + + vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_encodeframe_decision( + ext_ratectrl->model, gop_index, encode_frame_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_update_encodeframe_result( + EXT_RATECTRL *ext_ratectrl, int64_t bit_count, int actual_encoding_qindex) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_result_t encode_frame_result; + encode_frame_result.bit_count = bit_count; + encode_frame_result.actual_encoding_qindex = actual_encoding_qindex; + rc_status = ext_ratectrl->funcs.update_encodeframe_result( + ext_ratectrl->model, &encode_frame_result); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_get_gop_decision( + EXT_RATECTRL *ext_ratectrl, vpx_rc_gop_decision_t *gop_decision) { + vpx_rc_status_t rc_status; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + rc_status = + ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model, gop_decision); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} + +vpx_codec_err_t vp9_extrc_get_key_frame_decision( + EXT_RATECTRL *ext_ratectrl, + vpx_rc_key_frame_decision_t *key_frame_decision) { + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + vpx_rc_status_t rc_status = ext_ratectrl->funcs.get_key_frame_decision( + ext_ratectrl->model, key_frame_decision); + return rc_status == VPX_RC_OK ? VPX_CODEC_OK : VPX_CODEC_ERROR; +} + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult) { + vpx_rc_status_t rc_status; + vpx_rc_encodeframe_info_t encode_frame_info; + if (ext_ratectrl == NULL || !ext_ratectrl->ready || + (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) { + return VPX_CODEC_INVALID_PARAM; + } + encode_frame_info.show_index = show_index; + encode_frame_info.coding_index = coding_index; + encode_frame_info.gop_index = gop_index; + encode_frame_info.frame_type = extrc_get_frame_type(update_type); + encode_frame_info.gop_size = gop_size; + encode_frame_info.use_alt_ref = use_alt_ref; + + vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs, + encode_frame_info.ref_frame_coding_indexes, + encode_frame_info.ref_frame_valid_list); + rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model, + &encode_frame_info, rdmult); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h new file mode 100644 index 0000000000..4ea1b24646 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ + +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" +#include "vp9/encoder/vp9_firstpass.h" + +typedef struct EXT_RATECTRL { + int ready; + int ext_rdmult; + vpx_rc_model_t model; + vpx_rc_funcs_t funcs; + vpx_rc_config_t ratectrl_config; + vpx_rc_firstpass_stats_t rc_firstpass_stats; + FILE *log_file; +} EXT_RATECTRL; + +vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs, + vpx_rc_config_t ratectrl_config, + EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); + +vpx_codec_err_t vp9_extrc_send_firstpass_stats( + EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); + +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + +vpx_codec_err_t vp9_extrc_get_encodeframe_decision( + EXT_RATECTRL *ext_ratectrl, int gop_index, + vpx_rc_encodeframe_decision_t *encode_frame_decision); + +vpx_codec_err_t vp9_extrc_update_encodeframe_result(EXT_RATECTRL *ext_ratectrl, + int64_t bit_count, + int actual_encoding_qindex); + +vpx_codec_err_t vp9_extrc_get_key_frame_decision( + EXT_RATECTRL *ext_ratectrl, + vpx_rc_key_frame_decision_t *key_frame_decision); + +vpx_codec_err_t vp9_extrc_get_gop_decision(EXT_RATECTRL *ext_ratectrl, + vpx_rc_gop_decision_t *gop_decision); + +vpx_codec_err_t vp9_extrc_get_frame_rdmult( + EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, + FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, + RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags, + int *rdmult); + +#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c index f8e24610ae..69261ac65f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c @@ -18,18 +18,26 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch, int w, int h, int extend_top, int extend_left, - int extend_bottom, int extend_right) { - int i, linesize; + int extend_bottom, int extend_right, + int interleave_step) { + int i, j, linesize; + const int step = interleave_step < 1 ? 1 : interleave_step; // copy the left and right most columns out const uint8_t *src_ptr1 = src; - const uint8_t *src_ptr2 = src + w - 1; + const uint8_t *src_ptr2 = src + (w - 1) * step; uint8_t *dst_ptr1 = dst - extend_left; uint8_t *dst_ptr2 = dst + w; for (i = 0; i < h; i++) { memset(dst_ptr1, src_ptr1[0], extend_left); - memcpy(dst_ptr1 + extend_left, src_ptr1, w); + if (step == 1) { + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + } else { + for (j = 0; j < w; j++) { + dst_ptr1[extend_left + j] = src_ptr1[step * j]; + } + } memset(dst_ptr2, src_ptr2[0], extend_right); src_ptr1 += src_pitch; src_ptr2 += src_pitch; @@ -122,6 +130,8 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int el_uv = el_y >> uv_width_subsampling; const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; + // detect nv12 colorspace + const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1; #if CONFIG_VP9_HIGHBITDEPTH if (src->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -142,50 +152,13 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_crop_width, src->y_crop_height, - et_y, el_y, eb_y, er_y); + et_y, el_y, eb_y, er_y, 1); copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, - et_uv, el_uv, eb_uv, er_uv); + et_uv, el_uv, eb_uv, er_uv, chroma_step); copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, src->uv_crop_width, src->uv_crop_height, - et_uv, el_uv, eb_uv, er_uv); -} - -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw) { - // If the side is not touching the bounder then don't extend. - const int et_y = srcy ? 0 : dst->border; - const int el_y = srcx ? 0 : dst->border; - const int eb_y = srcy + srch != src->y_height - ? 0 - : dst->border + dst->y_height - src->y_height; - const int er_y = srcx + srcw != src->y_width - ? 0 - : dst->border + dst->y_width - src->y_width; - const int src_y_offset = srcy * src->y_stride + srcx; - const int dst_y_offset = srcy * dst->y_stride + srcx; - - const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); - const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); - const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); - const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); - const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); - const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); - - copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, - dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch, - et_y, el_y, eb_y, er_y); - - copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, - dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, - dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv, - srch_uv, et_uv, el_uv, eb_uv, er_uv); + et_uv, el_uv, eb_uv, er_uv, chroma_step); } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h index c0dd757159..21d7e68b9f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_EXTEND_H_ -#define VP9_ENCODER_VP9_EXTEND_H_ +#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_ +#define VPX_VP9_ENCODER_VP9_EXTEND_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -21,11 +21,8 @@ extern "C" { void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); -void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, int srcy, - int srcx, int srch, int srcw); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_EXTEND_H_ +#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c index 72e9ac77e7..e5918a9bfe 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c @@ -10,6 +10,7 @@ #include #include +#include #include #include "./vpx_dsp_rtcd.h" @@ -31,38 +32,59 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" #include "vpx_dsp/variance.h" #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 -#define BOOST_BREAKOUT 12.5 -#define BOOST_FACTOR 12.5 -#define FACTOR_PT_LOW 0.70 -#define FACTOR_PT_HIGH 0.90 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 -#define INTRA_MODE_PENALTY 1024 -#define MIN_ARF_GF_BOOST 240 +#define NORMAL_BOOST 100 +#define MIN_ARF_GF_BOOST 250 #define MIN_DECAY_FACTOR 0.01 #define NEW_MV_MODE_PENALTY 32 -#define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 -#define DEFAULT_GRP_WEIGHT 1.0 -#define RC_FACTOR_MIN 0.75 -#define RC_FACTOR_MAX 1.75 -#define SECTION_NOISE_DEF 250.0 #define LOW_I_THRESH 24000 #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 -#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define LOW_CODED_ERR_PER_MB 10.0 +#define NCOUNT_FRAME_II_THRESH 6.0 +#define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_FRAME_BOOST 96.0 + +#ifdef AGGRESSIVE_VBR +#define KF_MIN_FRAME_BOOST 40.0 +#define KF_MAX_FRAME_BOOST 80.0 +#define MAX_KF_TOT_BOOST 4800 +#else +#define KF_MIN_FRAME_BOOST 40.0 +#define KF_MAX_FRAME_BOOST 96.0 +#define MAX_KF_TOT_BOOST 5400 +#endif + +#define DEFAULT_ZM_FACTOR 0.5 +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +#define AV_WQ_FACTOR 4.0 + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001) #if ARF_STATS_OUTPUT unsigned int arf_count = 0; @@ -92,14 +114,8 @@ static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { return 1; } -static void output_stats(FIRSTPASS_STATS *stats, - struct vpx_codec_pkt_list *pktlist) { - struct vpx_codec_cx_pkt pkt; - pkt.kind = VPX_CODEC_STATS_PKT; - pkt.data.twopass_stats.buf = stats; - pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); - vpx_codec_pkt_list_add(pktlist, &pkt); - +static void output_stats(FIRSTPASS_STATS *stats) { + (void)stats; // TEMP debug code #if OUTPUT_FPF { @@ -107,13 +123,15 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf" + "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral, + stats->pcnt_intra_low, stats->pcnt_intra_high, stats->intra_skip_pct, stats->intra_smooth_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, @@ -123,17 +141,6 @@ static void output_stats(FIRSTPASS_STATS *stats, #endif } -#if CONFIG_FP_MB_STATS -static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, - struct vpx_codec_pkt_list *pktlist) { - struct vpx_codec_cx_pkt pkt; - pkt.kind = VPX_CODEC_FPMB_STATS_PKT; - pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; - pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t); - vpx_codec_pkt_list_add(pktlist, &pkt); -} -#endif - static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; @@ -147,8 +154,11 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_neutral = 0.0; section->intra_skip_pct = 0.0; section->intra_smooth_pct = 0.0; + section->pcnt_intra_low = 0.0; + section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -176,8 +186,11 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_neutral += frame->pcnt_neutral; section->intra_skip_pct += frame->intra_skip_pct; section->intra_smooth_pct += frame->intra_smooth_pct; + section->pcnt_intra_low += frame->pcnt_intra_low; + section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -203,8 +216,11 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_neutral -= frame->pcnt_neutral; section->intra_skip_pct -= frame->intra_skip_pct; section->intra_smooth_pct -= frame->intra_smooth_pct; + section->pcnt_intra_low -= frame->pcnt_intra_low; + section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= frame->new_mv_count; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; @@ -220,28 +236,37 @@ static void subtract_stats(FIRSTPASS_STATS *section, // bars and partially discounts other 0 energy areas. #define MIN_ACTIVE_AREA 0.5 #define MAX_ACTIVE_AREA 1.0 -static double calculate_active_area(const VP9_COMP *cpi, +static double calculate_active_area(const FRAME_INFO *frame_info, const FIRSTPASS_STATS *this_frame) { double active_pct; active_pct = 1.0 - ((this_frame->intra_skip_pct / 2) + - ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows)); + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 // Calculate a modified Error used in distributing bits between easier and // harder frames. -#define ACT_AREA_CORRECTION 0.5 -static double calculate_modified_err(const VP9_COMP *cpi, - const TWO_PASS *twopass, - const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; - double modified_error = +static double calculate_mod_frame_score(const VP9_COMP *cpi, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0); @@ -251,11 +276,44 @@ static double calculate_modified_err(const VP9_COMP *cpi, // remaining active MBs. The correction here assumes that coding // 0.5N blocks of complexity 2X is a little easier than coding N // blocks of complexity X. - modified_error *= - pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + modified_score *= pow(calculate_active_area(&cpi->frame_info, this_frame), + ACT_AREA_CORRECTION); - return fclamp(modified_error, twopass->modified_error_min, - twopass->modified_error_max); + return modified_score; +} + +static double calc_norm_frame_score(const VP9EncoderConfig *oxcf, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double mean_mod_score, double av_err) { + double modified_score = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0; + const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0; + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_score *= + pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION); + + // Normalize to a midpoint score. + modified_score /= DOUBLE_DIVIDE_CHECK(mean_mod_score); + return fclamp(modified_score, min_score, max_score); +} + +static double calculate_norm_frame_score(const VP9_COMP *cpi, + const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame, + const double av_err) { + return calc_norm_frame_score(oxcf, &cpi->frame_info, this_frame, + twopass->mean_mod_score, av_err); } // This function returns the maximum target rate per frame. @@ -277,15 +335,10 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (is_two_pass_svc(cpi)) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - output_stats(&cpi->svc.layer_context[i].twopass.total_stats, - cpi->output_pkt_list); - } - } else { - output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); - } + output_stats(&cpi->twopass.total_stats); + cpi->twopass.first_pass_done = 1; + vpx_free(cpi->twopass.fp_mb_float_stats); + cpi->twopass.fp_mb_float_stats = NULL; } static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { @@ -317,7 +370,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_8_mse8x16; default: return vpx_highbd_8_mse16x16; } - break; case 10: switch (bsize) { case BLOCK_8X8: return vpx_highbd_10_mse8x8; @@ -325,7 +377,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_10_mse8x16; default: return vpx_highbd_10_mse16x16; } - break; case 12: switch (bsize) { case BLOCK_8X8: return vpx_highbd_12_mse8x8; @@ -333,7 +384,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_12_mse8x16; default: return vpx_highbd_12_mse16x16; } - break; } } @@ -352,12 +402,36 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, // for first pass test. static int get_search_range(const VP9_COMP *cpi) { int sr = 0; - const int dim = VPXMIN(cpi->initial_width, cpi->initial_height); + int dim = VPXMIN(cpi->initial_width, cpi->initial_height); + dim = VPXMAX(dim, MI_SIZE); while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; return sr; } +// Reduce limits to keep the motion search within MV_MAX of ref_mv. Not doing +// this can be problematic for big videos (8K) and may cause assert failure +// (or memory violation) in mv_cost. Limits are only modified if they would +// be non-empty. Returns 1 if limits are non-empty. +static int intersect_limits_with_mv_max(MvLimits *mv_limits, const MV *ref_mv) { + const int row_min = + VPXMAX(mv_limits->row_min, (ref_mv->row + 7 - MV_MAX) >> 3); + const int row_max = + VPXMIN(mv_limits->row_max, (ref_mv->row - 1 + MV_MAX) >> 3); + const int col_min = + VPXMAX(mv_limits->col_min, (ref_mv->col + 7 - MV_MAX) >> 3); + const int col_max = + VPXMIN(mv_limits->col_max, (ref_mv->col - 1 + MV_MAX) >> 3); + if (row_min > row_max || col_min > col_max) { + return 0; + } + mv_limits->row_min = row_min; + mv_limits->row_max = row_max; + mv_limits->col_min = col_min; + mv_limits->col_max = col_max; + return 1; +} + static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MV *ref_mv, MV *best_mv, int *best_motion_err) { @@ -368,13 +442,21 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; const int sr = get_search_range(cpi); + const MvLimits tmp_mv_limits = x->mv_limits; step_param += sr; further_steps -= sr; + if (!intersect_limits_with_mv_max(&x->mv_limits, ref_mv)) { + return; + } + // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); #if CONFIG_VP9_HIGHBITDEPTH @@ -383,10 +465,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; + // Center the initial step/diamond search on best mv. - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -406,9 +496,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -420,6 +510,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } } } + x->mv_limits = tmp_mv_limits; } static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { @@ -460,12 +551,11 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; - case VPX_BITS_10: ret_val = thresh >> 4; break; - case VPX_BITS_12: ret_val = thresh >> 8; break; + case VPX_BITS_10: ret_val = thresh << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -487,11 +577,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -508,11 +597,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -522,14 +610,14 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { } #define FP_DN_THRESH 8 -#define FP_MAX_DN_THRESH 16 +#define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 -// Baseline Kernal weights for first pass noise metric -static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, +// Baseline Kernel weights for first pass noise metric +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernal +// Estimate noise at a single point based on the impact of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -539,23 +627,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int diff; int dn_diff; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint8_t dn_val; uint8_t centre_val = *src_ptr; - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { for (j = 0; j < KERNEL_SIZE; ++j) { diff = abs((int)centre_val - (int)tmp_ptr[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -581,13 +669,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { int dn_diff; uint8_t *tmp_ptr; uint16_t *tmp_ptr16; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint16_t dn_val; uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); @@ -595,10 +683,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { diff = abs((int)centre_val - (int)tmp_ptr16[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -646,37 +734,166 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) { return block_noise << 2; // Scale << 2 to account for sampling. } -#define INVALID_ROW -1 -void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { +// This function is called to test the functionality of row based +// multi-threading in unit tests for bit-exactness +static void accumulate_floating_point_stats(VP9_COMP *cpi, + TileDataEnc *first_tile_col) { + VP9_COMMON *const cm = &cpi->common; int mb_row, mb_col; - MACROBLOCK *const x = &cpi->td.mb; + first_tile_col->fp_data.intra_factor = 0; + first_tile_col->fp_data.brightness_factor = 0; + first_tile_col->fp_data.neutral_count = 0; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + first_tile_col->fp_data.intra_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor; + first_tile_col->fp_data.brightness_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor; + first_tile_col->fp_data.neutral_count += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count; + } + } +} + +static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, + FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + // The minimum error here insures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) || + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (fp_acc_data->image_data_start_row > 0) { + fp_acc_data->intra_skip_count = + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + } + + fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; + fp_acc_data->brightness_factor = + fp_acc_data->brightness_factor / (double)num_mbs; + fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor; + + fps->frame = cm->current_video_frame; + fps->spatial_layer_id = cpi->svc.spatial_layer_id; + + fps->coded_error = + ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs; + fps->sr_coded_error = + ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs; + fps->intra_error = + ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs; + + fps->frame_noise_energy = + (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; + fps->count = 1.0; + fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs; + fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs; + fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs; + fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs; + fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs; + fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs; + fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs; + fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row); + // Currently set to 0 as most issues relate to letter boxing. + fps->inactive_zone_cols = (double)0; + + if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; + fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; + fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; + fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; + fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount; + fps->MVrv = ((double)(fp_acc_data->sum_mvrs) - + ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->MVcv = ((double)(fp_acc_data->sum_mvcs) - + ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->mv_in_out_count = + (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); + fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; + } else { + fps->new_mv_count = 0.0; + fps->MVr = 0.0; + fps->mvr_abs = 0.0; + fps->MVc = 0.0; + fps->mvc_abs = 0.0; + fps->MVrv = 0.0; + fps->MVcv = 0.0; + fps->mv_in_out_count = 0.0; + fps->pcnt_motion = 0.0; + } +} + +static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, + FIRSTPASS_DATA *fp_acc_data) { + this_tile->fp_data.intra_factor += fp_acc_data->intra_factor; + this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor; + this_tile->fp_data.coded_error += fp_acc_data->coded_error; + this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error; + this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy; + this_tile->fp_data.intra_error += fp_acc_data->intra_error; + this_tile->fp_data.intercount += fp_acc_data->intercount; + this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count; + this_tile->fp_data.neutral_count += fp_acc_data->neutral_count; + this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; + this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; + this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; + this_tile->fp_data.mvcount += fp_acc_data->mvcount; + this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; + this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; + this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc; + this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs; + this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs; + this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs; + this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors; + this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count; + const int min_start_row = VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row); + this_tile->fp_data.image_data_start_row = + (min_start_row == INVALID_ROW) + ? VPXMAX(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) + : min_start_row; +} + +#define NZ_MOTION_PENALTY 128 +#define INTRA_MODE_PENALTY 1024 +void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + TileDataEnc *tile_data, MV *best_ref_mv, + int mb_row) { + int mb_col; + MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - TileInfo tile; + TileInfo tile = tile_data->tile_info; + const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1); + const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1); struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; - int i; + const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; + int i, c; + int num_mb_cols = get_num_cols(tile_data->tile_info, 1); int recon_yoffset, recon_uvoffset; - int64_t intra_error = 0; - int64_t coded_error = 0; - int64_t sr_coded_error = 0; - int64_t frame_noise_energy = 0; - - int sum_mvr = 0, sum_mvc = 0; - int sum_mvr_abs = 0, sum_mvc_abs = 0; - int64_t sum_mvrs = 0, sum_mvcs = 0; - int mvcount = 0; - int intercount = 0; - int second_ref_count = 0; const int intrapenalty = INTRA_MODE_PENALTY; - double neutral_count; - int intra_skip_count = 0; - int intra_smooth_count = 0; - int image_data_start_row = INVALID_ROW; - int sum_in_vectors = 0; - TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -685,76 +902,480 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; - double intra_factor; - double brightness_factor; - BufferPool *const pool = cm->buffer_pool; MODE_INFO mi_above, mi_left; + double mb_intra_factor; + double mb_brightness_factor; + double mb_neutral_count; + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start; + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; } -#endif - vpx_clear_system_state(); + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - intra_factor = 0.0; - brightness_factor = 0.0; - neutral_count = 0.0; + // Reset above block coeffs. + recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16; + recon_uvoffset = + (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height; - set_first_pass_params(cpi); - vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - if (lc != NULL) { - twopass = &lc->twopass; + for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) { + int this_error; + int this_intra_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; + const int mb_index = mb_row * cm->mb_cols + mb_col; - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - cpi->ref_frame_flags = VP9_LAST_FLAG; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); - if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < - REF_FRAMES) { - cpi->gld_fb_idx = - cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; - cpi->ref_frame_flags |= VP9_GOLD_FLAG; - cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + + // Adjust to the next column of MBs. + x->plane[0].src.buf = cpi->Source->y_buffer + + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; + x->plane[1].src.buf = cpi->Source->u_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + x->plane[2].src.buf = cpi->Source->v_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + + vpx_clear_system_state(); + + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, + cm->mi_cols); + // Are edges available for intra prediction? + // Since the firstpass does not populate the mi_grid_visible, + // above_mi/left_mi must be overwritten with a nonzero value when edges + // are available. Required by vp9_predict_intra_block(). + xd->above_mi = (mb_row != 0) ? &mi_above : NULL; + xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL; + + // Do intra 16x16 prediction. + x->skip_encode = 0; + x->fp_src_pred = 0; + // Do intra prediction based on source pixels for tile boundaries + if (mb_col == mb_col_start && mb_col != 0) { + xd->left_mi = &mi_left; + x->fp_src_pred = 1; + } + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + // Fix - zero the 16x16 block first. This ensures correct this_error for + // block sizes smaller than 16x16. + vp9_zero_array(x->plane[0].src_diff, 256); + vp9_encode_intra_block_plane(x, bsize, 0, 0); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + this_intra_error = this_error; + + // Keep a record of blocks that have very low intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < get_ul_intra_threshold(cm)) { + ++(fp_acc_data->intra_skip_count); + } else if ((mb_col > 0) && + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = mb_row; + } + + // Blocks that are mainly smooth in the intra domain. + // Some special accounting for CQ but also these are better for testing + // noise levels. + if (this_error < get_smooth_intra_threshold(cm)) { + ++(fp_acc_data->intra_smooth_count); + } + + // Special case noise measurement for first frame. + if (cm->current_video_frame == 0) { + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: break; + case VPX_BITS_10: this_error >>= 4; break; + default: + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + + vpx_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) { + mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05); + fp_acc_data->intra_factor += mb_intra_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = + mb_intra_factor; } else { - cpi->refresh_golden_frame = 0; + fp_acc_data->intra_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; } - if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - - vp9_scale_references(cpi); - - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else + level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; +#endif + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample)); + fp_acc_data->brightness_factor += mb_brightness_factor; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + mb_brightness_factor; + } else { + fp_acc_data->brightness_factor += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + 1.0; } - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; + + // Accumulate the intra error. + fp_acc_data->intra_error += (int64_t)this_error; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + // Other than for intra-only frame do a motion search. + if (!frame_is_intra_only(cm)) { + int tmp_err, motion_error, this_motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + this_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8); + } else { + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is very small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (raw_motion_error > NZ_MOTION_PENALTY) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + this_motion_error = + vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + this_motion_error = + vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); + } + } + + // Search in an older reference frame. + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); +#endif // CONFIG_VP9_HIGHBITDEPTH + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++(fp_acc_data->second_ref_count); + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + fp_acc_data->sr_coded_error += gf_motion_error; + else + fp_acc_data->sr_coded_error += this_error; + } else { + fp_acc_data->sr_coded_error += motion_error; + } + } else { + fp_acc_data->sr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + + if (motion_error <= this_error) { + vpx_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + fp_acc_data->neutral_count += 1.0; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. + } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + mb_neutral_count = + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); + fp_acc_data->neutral_count += mb_neutral_count; + if (cpi->row_mt_bit_exact) + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + mb_neutral_count; + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + fp_acc_data->sum_mvr += mv.row; + fp_acc_data->sum_mvr_abs += abs(mv.row); + fp_acc_data->sum_mvc += mv.col; + fp_acc_data->sum_mvc_abs += abs(mv.col); + fp_acc_data->sum_mvrs += mv.row * mv.row; + fp_acc_data->sum_mvcs += mv.col * mv.col; + ++(fp_acc_data->intercount); + + *best_ref_mv = mv; + + if (!is_zero_mv(&mv)) { + ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; + + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + --(fp_acc_data->sum_in_vectors); + } + + // Does the col vector point inwards or outwards? + if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + --(fp_acc_data->sum_in_vectors); + } + } + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { // Intra < inter error + if (this_intra_error < scaled_low_intra_thresh) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + if (this_motion_error < scaled_low_intra_thresh) { + fp_acc_data->intra_count_low += 1.0; + } else { + fp_acc_data->intra_count_high += 1.0; + } + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + fp_acc_data->intra_count_high += 1.0; + } } } else { - gld_yv12 = NULL; + fp_acc_data->sr_coded_error += (int64_t)this_error; } + fp_acc_data->coded_error += (int64_t)this_error; - set_ref_ptrs(cm, xd, - (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, - (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0); + // Accumulate row level stats to the corresponding tile stats + if (cpi->row_mt && mb_col == mb_col_end - 1) + accumulate_fp_mb_row_stat(tile_data, fp_acc_data); + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, + num_mb_cols); } + vpx_clear_system_state(); +} + +static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + int mb_row; + TileDataEnc tile_data; + TileInfo *tile = &tile_data.tile_info; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + // Tiling is ignored in the first pass. + vp9_tile_init(tile, cm, 0, 0); + tile_data.firstpass_top_mv = zero_mv; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + best_ref_mv = zero_mv; + vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, + &best_ref_mv, mb_row); + } +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TWO_PASS *twopass = &cpi->twopass; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + BufferPool *const pool = cm->buffer_pool; + + FIRSTPASS_DATA fp_temp_data; + FIRSTPASS_DATA *fp_acc_data = &fp_temp_data; + + vpx_clear_system_state(); + vp9_zero(fp_temp_data); + fp_acc_data->image_data_start_row = INVALID_ROW; + + // First pass code requires valid last and new frame buffers. + assert(new_yv12 != NULL); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); + + set_first_pass_params(cpi); + vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth), 0); vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -770,518 +1391,52 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_frame_init_quantizer(cpi); - for (i = 0; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - p[i].eobs = ctx->eobs_pbuf[i][1]; - } x->skip_recode = 0; vp9_init_mv_probs(cm); vp9_initialize_rd_consts(cpi); - // Tiling is ignored in the first pass. - vp9_tile_init(&tile, cm, 0, 0); + cm->log2_tile_rows = 0; - recon_y_stride = new_yv12->y_stride; - recon_uv_stride = new_yv12->uv_stride; - uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - - for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - MV best_ref_mv = { 0, 0 }; - - // Reset above block coeffs. - recon_yoffset = (mb_row * recon_y_stride * 16); - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.row_max = - ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - - for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - int this_error; - int this_intra_error; - const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - double log_intra; - int level_sample; - -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - - vpx_clear_system_state(); - - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; - xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], - mb_col << 1, num_8x8_blocks_wide_lookup[bsize], - cm->mi_rows, cm->mi_cols); - // Are edges available for intra prediction? - // Since the firstpass does not populate the mi_grid_visible, - // above_mi/left_mi must be overwritten with a nonzero value when edges - // are available. Required by vp9_predict_intra_block(). - xd->above_mi = (mb_row != 0) ? &mi_above : NULL; - xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL; - - // Do intra 16x16 prediction. - x->skip_encode = 0; - xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = - use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - - // Set the 16x16 src_diff block to zero, which ensures correct this_error - // calculation for block sizes smaller than 16x16. - vp9_zero_array(x->plane[0].src_diff, 256); - vp9_encode_intra_block_plane(x, bsize, 0, 0); - this_error = vpx_get_mb_ss(x->plane[0].src_diff); - this_intra_error = this_error; - - // Keep a record of blocks that have very low intra error residual - // (i.e. are in effect completely flat and untextured in the intra - // domain). In natural videos this is uncommon, but it is much more - // common in animations, graphics and screen content, so may be used - // as a signal to detect these types of content. - if (this_error < get_ul_intra_threshold(cm)) { - ++intra_skip_count; - } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { - image_data_start_row = mb_row; - } - - // Blocks that are mainly smooth in the intra domain. - // Some special accounting for CQ but also these are better for testing - // noise levels. - if (this_error < get_smooth_intra_threshold(cm)) { - ++intra_smooth_count; - } - - // Special case noise measurement for first frame. - if (cm->current_video_frame == 0) { - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { - case VPX_BITS_8: break; - case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH - - vpx_clear_system_state(); - log_intra = log(this_error + 1.0); - if (log_intra < 10.0) - intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); - else - intra_factor += 1.0; - -#if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) - level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; - else - level_sample = x->plane[0].src.buf[0]; -#else - level_sample = x->plane[0].src.buf[0]; -#endif - if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) - brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); - else - brightness_factor += 1.0; - - // Intrapenalty below deals with situations where the intra and inter - // error scores are very low (e.g. a plain black frame). - // We do not have special cases in first pass for 0,0 and nearest etc so - // all inter modes carry an overhead cost estimate for the mv. - // When the error score is very low this causes us to pick all or lots of - // INTRA modes and throw lots of key frames. - // This penalty adds a cost matching that of a 0,0 mv to the intra case. - this_error += intrapenalty; - - // Accumulate the intra error. - intra_error += (int64_t)this_error; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } -#endif - - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.col_max = - ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - - // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; - // Assume 0,0 motion with no mv overhead. - MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; - struct buf_2d unscaled_last_source_buf_2d; - - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - motion_error = - get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Compute the motion error of the 0,0 motion using the last source - // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. - unscaled_last_source_buf_2d.buf = - cpi->unscaled_last_source->y_buffer + recon_yoffset; - unscaled_last_source_buf_2d.stride = - cpi->unscaled_last_source->y_stride; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - raw_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); - } else { - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); - } -#else - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); - - // If the current best reference mv is not centered on 0,0 then do a - // 0,0 based search as well. - if (!is_zero_mv(&best_ref_mv)) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); - - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv = tmp_mv; - } - } - - // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; - - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - gf_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); -#endif // CONFIG_VP9_HIGHBITDEPTH - - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, - &gf_motion_error); - - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; - } else { - sr_coded_error += motion_error; - } - } else { - sr_coded_error += motion_error; - } - - // Start by assuming that intra mode is best. - best_ref_mv.row = 0; - best_ref_mv.col = 0; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (motion_error <= this_error) { - vpx_clear_system_state(); - - // Keep a count of cases where the inter and intra were very close - // and very low. This helps with scene cut detection for example in - // cropped clips with black bars at the sides or top and bottom. - if (((this_error - intrapenalty) * 9 <= motion_error * 10) && - (this_error < (2 * intrapenalty))) { - neutral_count += 1.0; - // Also track cases where the intra is not much worse than the inter - // and use this in limiting the GF/arf group length. - } else if ((this_error > NCOUNT_INTRA_THRESH) && - (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { - neutral_count += - (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); - } - - mv.row *= 8; - mv.col *= 8; - this_error = motion_error; - xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = mv; - xd->mi[0]->tx_size = TX_4X4; - xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; - vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby_pass1(x, bsize); - sum_mvr += mv.row; - sum_mvr_abs += abs(mv.row); - sum_mvc += mv.col; - sum_mvc_abs += abs(mv.col); - sum_mvrs += mv.row * mv.row; - sum_mvcs += mv.col * mv.col; - ++intercount; - - best_ref_mv = mv; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_SMALL_MASK; - } - } -#endif - - if (!is_zero_mv(&mv)) { - ++mvcount; - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= - ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif - - // Does the row vector point inwards or outwards? - if (mb_row < cm->mb_rows / 2) { - if (mv.row > 0) - --sum_in_vectors; - else if (mv.row < 0) - ++sum_in_vectors; - } else if (mb_row > cm->mb_rows / 2) { - if (mv.row > 0) - ++sum_in_vectors; - else if (mv.row < 0) - --sum_in_vectors; - } - - // Does the col vector point inwards or outwards? - if (mb_col < cm->mb_cols / 2) { - if (mv.col > 0) - --sum_in_vectors; - else if (mv.col < 0) - ++sum_in_vectors; - } else if (mb_col > cm->mb_cols / 2) { - if (mv.col > 0) - ++sum_in_vectors; - else if (mv.col < 0) - --sum_in_vectors; - } - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { // Intra < inter error - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) - frame_noise_energy += fp_estimate_block_noise(x, bsize); - else - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { - sr_coded_error += (int64_t)this_error; - } - coded_error += (int64_t)this_error; - - // Adjust to the next column of MBs. - x->plane[0].src.buf += 16; - x->plane[1].src.buf += uv_mb_height; - x->plane[2].src.buf += uv_mb_height; - - recon_yoffset += 16; - recon_uvoffset += uv_mb_height; - } - - // Adjust to the next row of MBs. - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - x->plane[1].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - x->plane[2].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - - vpx_clear_system_state(); - } - - // Clamp the image start to rows/2. This number of rows is discarded top - // and bottom as dead data so rows / 2 means the frame is blank. - if ((image_data_start_row > cm->mb_rows / 2) || - (image_data_start_row == INVALID_ROW)) { - image_data_start_row = cm->mb_rows / 2; - } - // Exclude any image dead zone - if (image_data_start_row > 0) { - intra_skip_count = - VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); - } + if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) + CHECK_MEM_ERROR( + &cm->error, cpi->twopass.fp_mb_float_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { FIRSTPASS_STATS fps; - // The minimum error here insures some bit allocation to frames even - // in static regions. The allocation per MB declines for larger formats - // where the typical "real" energy per MB also falls. - // Initial estimate here uses sqrt(mbs) to define the min_err, where the - // number of mbs is proportional to the image area. - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - const double min_err = 200 * sqrt(num_mbs); - - intra_factor = intra_factor / (double)num_mbs; - brightness_factor = brightness_factor / (double)num_mbs; - fps.weight = intra_factor * brightness_factor; - - fps.frame = cm->current_video_frame; - fps.spatial_layer_id = cpi->svc.spatial_layer_id; - fps.coded_error = (double)(coded_error >> 8) + min_err; - fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; - fps.intra_error = (double)(intra_error >> 8) + min_err; - fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs; - fps.count = 1.0; - fps.pcnt_inter = (double)intercount / num_mbs; - fps.pcnt_second_ref = (double)second_ref_count / num_mbs; - fps.pcnt_neutral = (double)neutral_count / num_mbs; - fps.intra_skip_pct = (double)intra_skip_count / num_mbs; - fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs; - fps.inactive_zone_rows = (double)image_data_start_row; - // Currently set to 0 as most issues relate to letter boxing. - fps.inactive_zone_cols = (double)0; - - if (mvcount > 0) { - fps.MVr = (double)sum_mvr / mvcount; - fps.mvr_abs = (double)sum_mvr_abs / mvcount; - fps.MVc = (double)sum_mvc / mvcount; - fps.mvc_abs = (double)sum_mvc_abs / mvcount; - fps.MVrv = - ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; - fps.MVcv = - ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; - fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.pcnt_motion = (double)mvcount / num_mbs; + TileDataEnc *first_tile_col; + if (!cpi->row_mt) { + cm->log2_tile_cols = 0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + first_pass_encode(cpi, fp_acc_data); + first_pass_stat_calc(cpi, &fps, fp_acc_data); } else { - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.pcnt_motion = 0.0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + if (cpi->row_mt_bit_exact) { + cm->log2_tile_cols = 0; + vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); + } + vp9_encode_fp_row_mt(cpi); + first_tile_col = &cpi->tile_data[0]; + if (cpi->row_mt_bit_exact) + accumulate_floating_point_stats(cpi, first_tile_col); + first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } - // Dont allow a value of 0 for duration. + // Don't allow a value of 0 for duration. // (Section duration is also defaulted to minimum of 1.0). fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); // Don't want to do output stats with a stack variable! twopass->this_frame_stats = fps; - output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); + output_stats(&twopass->this_frame_stats); accumulate_stats(&twopass->total_stats, &fps); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list); - } -#endif } - // Copy the previous Last Frame back into gf and and arf buffers if + // Copy the previous Last Frame back into gf and arf buffers if // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && @@ -1299,50 +1454,38 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vpx_extend_frame_borders(new_yv12); - if (lc != NULL) { - vp9_update_reference_frames(cpi); - } else { - // The frame we just compressed now becomes the last frame. - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - } + // The frame we just compressed now becomes the last frame. + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); // Special case for the first frame. Copy into the GF buffer as a second // reference. - if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX && - lc == NULL) { + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->ref_frame_map[cpi->lst_fb_idx]); } - // Use this to see what the first pass reconstruction looks like. - if (0) { - char filename[512]; - FILE *recon_file; - snprintf(filename, sizeof(filename), "enc%04d.yuv", - (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) - recon_file = fopen(filename, "wb"); - else - recon_file = fopen(filename, "ab"); - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - - ++cm->current_video_frame; + // In the first pass, every frame is considered as a show frame. + update_frame_indexes(cm, /*show_frame=*/1); if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } -static double calc_correction_factor(double err_per_mb, double err_divisor, - double pt_low, double pt_high, int q, - vpx_bit_depth_t bit_depth) { - const double error_term = err_per_mb / err_divisor; +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.85, 0.90, 0.90, + 0.90, 1.00, 1.25 }; - // Adjustment based on actual quantizer to power term. - const double power_term = - VPXMIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); +static double calc_correction_factor(double err_per_mb, double err_divisor, + int q) { + const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor); + const int index = q >> 5; + double power_term; + + assert((index >= 0) && (index < (QINDEX_RANGE >> 5))); + + // Adjustment based on quantizer to the power term. + power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); // Calculate correction factor. if (power_term < 1.0) assert(error_term >= 0.0); @@ -1350,7 +1493,26 @@ static double calc_correction_factor(double err_per_mb, double err_divisor, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 115.0 +static double wq_err_divisor(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 115.0; + } else if (screen_area < 1280 * 720) { + return 125.0; + } else if (screen_area <= 1920 * 1080) { + return 130.0; + } else if (screen_area < 3840 * 2160) { + return 150.0; + } + + // Fall through to here only for 4K and above. + return 200.0; +} + #define NOISE_FACTOR_MIN 0.9 #define NOISE_FACTOR_MAX 1.1 static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, @@ -1359,6 +1521,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; // Clamp the target rate to VBR min / max limts. const int target_rate = @@ -1367,44 +1530,54 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); inactive_zone = fclamp(inactive_zone, 0.0, 1.0); +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = fclamp(last_group_rate_err, 0.25, 4.0); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = fclamp(twopass->bpm_factor, 0.25, 4.0); +#endif + if (target_rate <= 0) { return rc->worst_quality; // Highest value allowed } else { const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; - const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); - const double av_err_per_mb = section_err / active_mbs; + const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone); + const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); + const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double last_group_rate_err; - const int target_norm_bits_per_mb = - (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); + const uint64_t target_norm_bits_per_mb = + ((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs; int q; - int is_svc_upper_layer = 0; - - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) - is_svc_upper_layer = 1; +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM // based on recent history adjust expectations of bits per macroblock. last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); - last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + last_group_rate_err = fclamp(last_group_rate_err, 0.25, 4.0); twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; - twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); + twopass->bpm_factor = fclamp(twopass->bpm_factor, 0.25, 4.0); +#endif // Try and pick a max Q that will be high enough to encode the // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double factor = calc_correction_factor( - av_err_per_mb, ERR_DIVISOR, - is_svc_upper_layer ? SVC_FACTOR_PT_LOW : FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.bit_depth); + const double factor = + calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, cpi->common.bit_depth); - if (bits_per_mb <= target_norm_bits_per_mb) break; + if ((uint64_t)bits_per_mb <= target_norm_bits_per_mb) break; } // Restriction on active max q for constrained quality mode. @@ -1446,14 +1619,9 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, } void vp9_init_second_pass(VP9_COMP *cpi) { - SVC *const svc = &cpi->svc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = - is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; double frame_rate; FIRSTPASS_STATS *stats; @@ -1467,46 +1635,76 @@ void vp9_init_second_pass(VP9_COMP *cpi) { *stats = *twopass->stats_in_end; twopass->total_left_stats = *stats; + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e., a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we don't bother with + // further iterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However, the sum duration is not. // It is calculated based on the actual durations of all frames from the // first pass. - - if (is_two_pass_svc) { - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - vp9_new_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified total error based upon - // the bias/power function used to allocate bits. - { - const double avg_error = - stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_error_total = 0.0; - twopass->modified_error_min = - (avg_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (avg_error * oxcf->two_pass_vbrmax_section) / 100; - while (s < twopass->stats_in_end) { - modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); - ++s; - } - twopass->modified_error_left = modified_error_total; - } - // Reset the vbr bits off target counters rc->vbr_bits_off_target = 0; rc->vbr_bits_off_target_fast = 0; @@ -1531,110 +1729,110 @@ void vp9_init_second_pass(VP9_COMP *cpi) { twopass->arnr_strength_adjustment = 0; } -#define SR_DIFF_PART 0.0015 -#define INTRA_PART 0.005 -#define DEFAULT_DECAY_LIMIT 0.75 -#define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 -#define LOW_CODED_ERR_PER_MB 10.0 -#define NCOUNT_FRAME_II_THRESH 6.0 - -static double get_sr_decay_rate(const VP9_COMP *cpi, +/* This function considers how the quality of prediction may be deteriorating + * with distance. It compares the coded error for the last frame and the + * second reference frame (usually two frames old) and also applies a factor + * based on the extent of INTRA coding. + * + * The decay factor is then used to reduce the contribution of frames further + * from the alt-ref or golden frame, to the bitrate boost calculation for that + * alt-ref or golden frame. + */ +static double get_sr_decay_rate(const TWO_PASS *const twopass, const FIRSTPASS_STATS *frame) { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error); double sr_decay = 1.0; - double modified_pct_inter; - double modified_pcnt_intra; - const double motion_amplitude_part = - frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / - (cpi->initial_height + cpi->initial_width)); - - modified_pct_inter = frame->pcnt_inter; - if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) && - ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < - (double)NCOUNT_FRAME_II_THRESH)) { - modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; - } - modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + // Do nothing if the second ref to last frame error difference is + // very small or even negative. if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - sr_diff = VPXMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - - (INTRA_PART * modified_pcnt_intra); + const double sr_diff_part = + twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error); + double modified_pct_inter = frame->pcnt_inter; + double modified_pcnt_intra; + + if ((frame->coded_error > LOW_CODED_ERR_PER_MB) && + ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH)) { + modified_pct_inter = + frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); + return VPXMAX(sr_decay, twopass->sr_default_decay_limit); } // This function gives an estimate of how badly we believe the prediction // quality is decaying from frame to frame. -static double get_zero_motion_factor(const VP9_COMP *cpi, - const FIRSTPASS_STATS *frame) { - const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; - double sr_decay = get_sr_decay_rate(cpi, frame); +static double get_zero_motion_factor(const TWO_PASS *const twopass, + const FIRSTPASS_STATS *frame_stats) { + const double zero_motion_pct = + frame_stats->pcnt_inter - frame_stats->pcnt_motion; + double sr_decay = get_sr_decay_rate(twopass, frame_stats); return VPXMIN(sr_decay, zero_motion_pct); } -#define ZM_POWER_FACTOR 0.75 +static double get_prediction_decay_rate(const TWO_PASS *const twopass, + const FIRSTPASS_STATS *frame_stats) { + const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats); + double zero_motion_factor = + twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion); -static double get_prediction_decay_rate(const VP9_COMP *cpi, - const FIRSTPASS_STATS *next_frame) { - const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame); - const double zero_motion_factor = - (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), - ZM_POWER_FACTOR)); + // Check that the zero motion factor is valid + assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0); return VPXMAX(zero_motion_factor, (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); } +static int get_show_idx(const TWO_PASS *twopass) { + return (int)(twopass->stats_in - twopass->stats_in_start); +} // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. -static int detect_transition_to_still(VP9_COMP *cpi, int frame_interval, - int still_interval, - double loop_decay_rate, - double last_decay_rate) { - TWO_PASS *const twopass = &cpi->twopass; - RATE_CONTROL *const rc = &cpi->rc; - - // Break clause to detect very still sections after motion - // For example a static image after a fade or other transition - // instead of a clean scene cut. - if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 && - last_decay_rate < 0.9) { - int j; - - // Look ahead a few frames to see if static condition persists... - for (j = 0; j < still_interval; ++j) { - const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; - if (stats >= twopass->stats_in_end) break; - - if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; - } - - // Only if it does do we signal a transition to still. - return j == still_interval; +static int check_transition_to_still(const FIRST_PASS_INFO *first_pass_info, + int show_idx, int still_interval) { + int j; + int num_frames = fps_get_num_frames(first_pass_info); + if (show_idx + still_interval > num_frames) { + return 0; } - return 0; + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = + fps_get_frame_stats(first_pass_info, show_idx + j); + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + + // Only if it does do we signal a transition to still. + return j == still_interval; } // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should // reflect this. -static int detect_flash(const TWO_PASS *twopass, int offset) { - const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); - +static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { // What we are looking for here is a situation where there is a // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. - return next_frame != NULL && - next_frame->pcnt_second_ref > next_frame->pcnt_inter && - next_frame->pcnt_second_ref >= 0.5; + // usage or a second ref coded error notabley lower than the last + // frame coded error. + if (frame_stats == NULL) { + return 0; + } + return (frame_stats->sr_coded_error < frame_stats->coded_error) || + ((frame_stats->pcnt_second_ref > frame_stats->pcnt_inter) && + (frame_stats->pcnt_second_ref >= 0.5)); +} + +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + return detect_flash_from_frame_stats(next_frame); } // Update the motion related elements to the GF arf boost calculation. @@ -1665,79 +1863,80 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } } -#define BASELINE_ERR_PER_MB 1000.0 -static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double *sr_accumulator, - double this_frame_mv_in_out, double max_boost) { +static double calc_frame_boost(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + const TWO_PASS *const twopass, + int avg_frame_qindex, + double this_frame_mv_in_out) { double frame_boost; - const double lq = vp9_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double lq = + vp9_convert_qindex_to_q(avg_frame_qindex, frame_info->bit_depth); const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; + const double active_area = calculate_active_area(frame_info, this_frame); - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); - - // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); - - // Update the accumulator for second ref error difference. - // This is intended to give an indication of how much the coded error is - // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; - *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + // Frame booost is based on inter error. + frame_boost = (twopass->err_per_mb * active_area) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); // Q correction and scalling - frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; + frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, max_boost * boost_q_correction); + return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction); } -#define KF_BOOST_FACTOR 12.5 static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, double this_frame_mv_in_out, - double max_boost) { + double zm_factor) { + TWO_PASS *const twopass = &cpi->twopass; double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; + const double active_area = + calculate_active_area(&cpi->frame_info, this_frame); + double max_boost; - // Correct for any inactive region in the image - num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); - - // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + // Frame booost is based on inter error. + frame_boost = (twopass->kf_err_per_mb * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. // This is intended to give an indication of how much the coded error is // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1; + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); *sr_accumulator = VPXMAX(0.0, *sr_accumulator); // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // Q correction and scalling - frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction; + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = + (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction; - return VPXMIN(frame_boost, max_boost * boost_q_correction); + // Maximum allowed boost this frame. May be different for first vs subsequent + // key frames. + max_boost = (cpi->common.current_video_frame == 0) + ? twopass->kf_frame_max_boost_first + : twopass->kf_frame_max_boost_subs; + max_boost *= zm_factor * boost_q_correction; + + return VPXMIN(frame_boost, max_boost); } -static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { - TWO_PASS *const twopass = &cpi->twopass; +static int compute_arf_boost(const FRAME_INFO *frame_info, + TWO_PASS *const twopass, int arf_show_idx, + int f_frames, int b_frames, int avg_frame_qindex) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; int i; double boost_score = 0.0; double mv_ratio_accumulator = 0.0; @@ -1745,13 +1944,15 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; // Search forward from the proposed arf/next gf position. for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1761,24 +1962,22 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash_from_frame_stats(this_frame) || + detect_flash_from_frame_stats(next_frame); // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } - *f_boost = (int)boost_score; + arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; @@ -1787,11 +1986,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1799,34 +2000,40 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // We want to discount the the flash frame itself and the recovery + // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash_from_frame_stats(this_frame) || + detect_flash_from_frame_stats(next_frame); // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator *= get_prediction_decay_rate(twopass, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(frame_info, this_frame, twopass, + avg_frame_qindex, this_frame_mv_in_out); } - *b_boost = (int)boost_score; + arf_boost += (int)boost_score; - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); return arf_boost; } +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { + const FRAME_INFO *frame_info = &cpi->frame_info; + TWO_PASS *const twopass = &cpi->twopass; + const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME]; + int arf_show_idx = get_show_idx(twopass); + return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, + b_frames, avg_inter_frame_qindex); +} + // Calculate a section intra ratio used in setting max loop filter. static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, const FIRSTPASS_STATS *end, @@ -1849,28 +2056,44 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, // Calculate the total bits to allocate in this GF/ARF group. static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, double gf_group_err) { + VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const TWO_PASS *const twopass = &cpi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; + const int is_key_frame = frame_is_intra_only(cm); + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; // Calculate the bits to be allocated to the group as a whole. - if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { + int key_frame_interval = rc->frames_since_key + rc->frames_to_key; + int distance_from_next_key_frame = + rc->frames_to_key - + (rc->baseline_gf_interval + rc->source_alt_ref_pending); + int max_gf_bits_bias = rc->avg_frame_bandwidth; + double gf_interval_bias_bits_normalize_factor = + (double)rc->baseline_gf_interval / 16; total_group_bits = (int64_t)(twopass->kf_group_bits * (gf_group_err / twopass->kf_group_error_left)); + // TODO(ravi): Experiment with different values of max_gf_bits_bias + total_group_bits += + (int64_t)((double)distance_from_next_key_frame / key_frame_interval * + max_gf_bits_bias * gf_interval_bias_bits_normalize_factor); } else { total_group_bits = 0; } // Clamp odd edge cases. - total_group_bits = - (total_group_bits < 0) ? 0 : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; + total_group_bits = (total_group_bits < 0) ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; // Clip based on user supplied data rate variability limit. - if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + if (total_group_bits > (int64_t)max_bits * gop_frames) + total_group_bits = (int64_t)max_bits * gop_frames; return total_group_bits; } @@ -1881,9 +2104,9 @@ static int calculate_boost_bits(int frame_count, int boost, int allocation_chunks; // return 0 for invalid inputs (could arise e.g. through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; - allocation_chunks = (frame_count * 100) + boost; + allocation_chunks = (frame_count * NORMAL_BOOST) + boost; // Prevent overflow. if (boost > 1023) { @@ -1897,183 +2120,421 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We don't ever want to return a 0 score here. + if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + + return score_total; +} + +static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, + int *index_counter, int depth, int start, int end) { + TWO_PASS *twopass = &cpi->twopass; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FIRSTPASS_STATS fpf_frame; + const int mid = (start + end + 1) >> 1; + const int min_frame_interval = 2; + int idx; + + // Process regular P frames + if ((end - start < min_frame_interval) || + (depth > gf_group->allowed_max_layer_depth)) { + for (idx = start; idx <= end; ++idx) { + gf_group->update_type[*index_counter] = LF_UPDATE; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = idx; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + gf_group->gfu_boost[*index_counter] = NORMAL_BOOST; + ++(*index_counter); + } + gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth); + return; + } + + assert(abs(mid - start) >= 1 && abs(mid - end) >= 1); + + // Process ARF frame + gf_group->layer_depth[*index_counter] = depth; + gf_group->update_type[*index_counter] = ARF_UPDATE; + gf_group->arf_src_offset[*index_counter] = mid - start; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = GF_ARF_LOW; + + for (idx = 0; idx <= mid; ++idx) + if (EOF == input_stats(twopass, &fpf_frame)) break; + + gf_group->gfu_boost[*index_counter] = + VPXMAX(MIN_ARF_GF_BOOST, + calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth); + + reset_fpf_position(twopass, start_pos); + + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1); + + gf_group->update_type[*index_counter] = USE_BUF_FRAME; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); +} + +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + gf_group->gfu_boost[frame_index] = NORMAL_BOOST; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + +static void define_gf_group_structure(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frame_index = 0; + int key_frame = cpi->common.frame_type == KEY_FRAME; + int layer_depth = 1; + int gop_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + + gf_group->frame_start = cpi->common.current_video_frame; + gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval; + gf_group->max_layer_depth = 0; + gf_group->allowed_max_layer_depth = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); + + ++frame_index; + + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = layer_depth; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + gf_group->max_layer_depth = 1; + ++frame_index; + ++layer_depth; + gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf; + } + + find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames); + + // TODO(b/345523905): Why do we need to set an overlay frame in the end? + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + + // Set the frame ops number. + gf_group->gf_group_size = frame_index; +} + +static INLINE void gf_group_set_overlay_frame(GF_GROUP *gf_group, + int frame_index, + int show_frame_index) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = show_frame_index; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; +} + +static INLINE void gf_group_set_key_frame(GF_GROUP *gf_group, int frame_index, + int show_frame_index) { + gf_group->update_type[frame_index] = KF_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = show_frame_index; + gf_group->rf_level[frame_index] = KF_STD; + gf_group->layer_depth[frame_index] = 0; +} + +static INLINE void gf_group_set_arf_frame(GF_GROUP *gf_group, int frame_index, + int show_frame_index) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(show_frame_index - frame_index); + gf_group->frame_gop_index[frame_index] = show_frame_index; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 1; +} + +static INLINE void gf_group_set_inter_normal_frame(GF_GROUP *gf_group, + int frame_index, + int show_frame_index) { + gf_group->update_type[frame_index] = LF_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = show_frame_index; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = 2; +} + +static INLINE void set_gf_frame_type(vpx_rc_frame_update_type_t update_type, + int show_frame_count, GF_GROUP *gf_group, + int *frame_index, int *show_frame_index) { + if (update_type == VPX_RC_KF_UPDATE) { + gf_group_set_key_frame(gf_group, *frame_index, *show_frame_index); + ++(*frame_index); + ++(*show_frame_index); + } else if (update_type == VPX_RC_OVERLAY_UPDATE) { + gf_group_set_overlay_frame(gf_group, *frame_index, *show_frame_index); + ++(*frame_index); + ++(*show_frame_index); + } else if (update_type == VPX_RC_ARF_UPDATE) { + gf_group_set_arf_frame(gf_group, *frame_index, show_frame_count); + ++(*frame_index); + } else if (update_type == VPX_RC_LF_UPDATE) { + gf_group_set_inter_normal_frame(gf_group, *frame_index, *show_frame_index); + ++(*frame_index); + ++(*show_frame_index); + } else { + assert(0); + } +} + +static void ext_rc_define_gf_group_structure( + const vpx_rc_gop_decision_t *gop_decision, GF_GROUP *gf_group) { + const int gop_coding_frames = gop_decision->gop_coding_frames; + + const int show_frame_count = gop_coding_frames - gop_decision->use_alt_ref; + int frame_index = 0; + int show_frame_index = 0; + + for (int i = frame_index; i < gop_coding_frames; i++) { + set_gf_frame_type(gop_decision->update_type[i], show_frame_count, gf_group, + &frame_index, &show_frame_index); + + gf_group->update_ref_idx[i] = gop_decision->update_ref_index[i]; + + gf_group->ext_rc_ref[i].last_index = 0; + gf_group->ext_rc_ref[i].golden_index = 0; + gf_group->ext_rc_ref[i].altref_index = 0; + for (int ref_frame = 0; ref_frame < 3; ref_frame++) { + const vpx_rc_ref_frame_t *const ext_ref_frame = + &gop_decision->ref_frame_list[i]; + const int ref_index = ext_ref_frame->index[ref_frame]; + gf_group->ref_frame_list[i][ref_frame] = ext_ref_frame->index[ref_frame]; + switch (ext_ref_frame->name[ref_frame]) { + case VPX_RC_LAST_FRAME: + gf_group->ext_rc_ref[i].last_index = ref_index; + break; + case VPX_RC_GOLDEN_FRAME: + gf_group->ext_rc_ref[i].golden_index = ref_index; + break; + case VPX_RC_ALTREF_FRAME: + gf_group->ext_rc_ref[i].altref_index = ref_index; + break; + default: break; + } + } + if (gf_group->update_type[i] == OVERLAY_UPDATE) { + // From ext_rc, overlay may not update any ref. But here we force it to + // update its arf's slot. This is probably OK since the arf and this + // overlay frame should be very similar. + gf_group->update_ref_idx[i] = gf_group->ext_rc_ref[i].altref_index; + } + } + // max_layer_depth is hardcoded to match the behavior of + // define_gf_group_structure() + // TODO(angiebird): Check whether max_layer_depth has performance impact. + gf_group->max_layer_depth = 2; + gf_group->allowed_max_layer_depth = 1; + gf_group->gf_group_size = gop_coding_frames; + + // TODO(b/345523905): Why do we need to set an overlay frame in the end? + assert(show_frame_count == show_frame_index); + if (gop_decision->use_alt_ref) { + gf_group_set_overlay_frame(gf_group, gf_group->gf_group_size, + show_frame_index); + } else { + gf_group_set_inter_normal_frame(gf_group, gf_group->gf_group_size, + show_frame_index); + } + + gf_group->frame_start = 0; + gf_group->frame_end = gf_group->gf_group_size - gop_decision->use_alt_ref; } static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS frame_stats; int i; - int frame_index = 1; + int frame_index = 0; int target_frame_size; int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; - int mid_boost_bits = 0; int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; - int alt_frame_index = frame_index; - int has_temporal_layers = - is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; - int last_frame_bits; - int last_frame_reduction; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; - // Only encode alt reference frame in temporal base layer. - if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; + // Define the GF structure and specify + int gop_frames = gf_group->gf_group_size; - key_frame = - cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi); - - get_arf_buffer_indices(arf_buffer_indices); + key_frame = cpi->common.frame_type == KEY_FRAME; // For key frames the frame target rate is already set and it // is also the golden frame. + // === [frame_index == 0] === if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[0] = OVERLAY_UPDATE; - gf_group->rf_level[0] = INTER_NORMAL; - gf_group->bit_allocation[0] = 0; - } else { - gf_group->update_type[0] = GF_UPDATE; - gf_group->rf_level[0] = GF_ARF_STD; - gf_group->bit_allocation[0] = gf_arf_bits; - } - gf_group->arf_update_idx[0] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; } // Deduct the boost bits for arf (or gf if it is not a key frame) // from the group total. if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + ++frame_index; + + // === [frame_index == 1] === // Store the bits to spend on the ARF if there is one. if (rc->source_alt_ref_pending) { - gf_group->update_type[alt_frame_index] = ARF_UPDATE; - gf_group->rf_level[alt_frame_index] = GF_ARF_STD; - gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; + gf_group->bit_allocation[frame_index] = gf_arf_bits; - if (has_temporal_layers) - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - - cpi->svc.number_temporal_layers); - else - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[alt_frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; - if (!has_temporal_layers) ++frame_index; - - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } + ++frame_index; } - // Note index of the first normal inter frame int eh group (not gf kf arf) - gf_group->first_inter_index = frame_index; - // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); - - // The last frame in the group is used less as a predictor so reduce - // its allocation a little. - if (normal_frames > 1) { + normal_frames = (rc->baseline_gf_interval - 1); + if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); - last_frame_reduction = normal_frame_bits / 16; - last_frame_bits = normal_frame_bits - last_frame_reduction; - } else { + else normal_frame_bits = (int)total_group_bits; - last_frame_bits = normal_frame_bits; - last_frame_reduction = 0; + + gf_group->gfu_boost[1] = rc->gfu_boost; + + if (cpi->multi_layer_arf) { + int idx; + int arf_depth_bits[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_count[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_boost[MAX_ARF_LAYERS] = { 0 }; + int total_arfs = 1; // Account for the base layer ARF. + + for (idx = 0; idx < gop_frames; ++idx) { + if (gf_group->update_type[idx] == ARF_UPDATE) { + arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx]; + ++arf_depth_count[gf_group->layer_depth[idx]]; + } + } + + for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) { + if (arf_depth_boost[idx] == 0) break; + arf_depth_bits[idx] = calculate_boost_bits( + rc->baseline_gf_interval - total_arfs - arf_depth_count[idx], + arf_depth_boost[idx], total_group_bits); + + total_group_bits -= arf_depth_bits[idx]; + total_arfs += arf_depth_count[idx]; + } + + // offset the base layer arf + normal_frames -= (total_arfs - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + target_frame_size = normal_frame_bits; + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + // The first layer ARF has its bit allocation assigned. + for (idx = frame_index; idx < gop_frames; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + gf_group->bit_allocation[idx] = + (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] * + gf_group->gfu_boost[idx]) / + arf_depth_boost[gf_group->layer_depth[idx]]); + break; + case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = target_frame_size; break; + } + } + gf_group->bit_allocation[idx] = 0; + + return; + } + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); } // Allocate bits to the other frames in the group. for (i = 0; i < normal_frames; ++i) { - int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; - - if (has_temporal_layers && frame_index == alt_frame_index) { - ++frame_index; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); } - target_frame_size = (i == (normal_frames - 1)) - ? last_frame_bits - : (i == mid_frame_idx) - ? normal_frame_bits + last_frame_reduction - : normal_frame_bits; - - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - - if (frame_index <= mid_frame_idx) arf_idx = 1; + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; } - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; target_frame_size = clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); - gf_group->update_type[frame_index] = LF_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->bit_allocation[frame_index] = target_frame_size; ++frame_index; } + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + // Note: // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to // vp9_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - // Final setup for second arf and its overlay. - if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - gf_group->bit_allocation[mid_frame_idx] = 0; - } - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Adjusts the ARNF filter for a GF group. @@ -2085,23 +2546,228 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, twopass->arnr_strength_adjustment = 0; - if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75))) + if (section_noise < 150) { twopass->arnr_strength_adjustment -= 1; + if (section_noise < 75) twopass->arnr_strength_adjustment -= 1; + } else if (section_noise > 250) + twopass->arnr_strength_adjustment += 1; + if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1; } // Analyse and define a gf/arf group. -static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { +#define ARF_ABS_ZOOM_THRESH 4.0 + +#define MAX_GF_BOOST 5400 + +typedef struct RANGE { + int min; + int max; +} RANGE; + +/* get_gop_coding_frame_num() depends on several fields in RATE_CONTROL *rc as + * follows. + * Static fields: + * (The following fields will remain unchanged after initialization of encoder.) + * rc->static_scene_max_gf_interval + * rc->min_gf_interval + * twopass->sr_diff_factor + * twopass->sr_default_decay_limit + * twopass->zm_factor + * + * Dynamic fields: + * (The following fields will be updated before or after coding each frame.) + * rc->frames_to_key + * rc->frames_since_key + * rc->source_alt_ref_active + * + * TODO(angiebird): Separate the dynamic fields and static fields into two + * structs. + */ +static int get_gop_coding_frame_num( + int *use_alt_ref, const FRAME_INFO *frame_info, + const TWO_PASS *const twopass, const RATE_CONTROL *rc, + int gf_start_show_idx, const RANGE *active_gf_interval, + double gop_intra_factor, int lag_in_frames, int *end_of_sequence) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + double loop_decay_rate = 1.00; + double mv_ratio_accumulator = 0.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + double sr_accumulator = 0.0; + // Motion breakout threshold for loop below depends on image size. + double mv_ratio_accumulator_thresh = + (frame_info->frame_height + frame_info->frame_width) / 4.0; + double zero_motion_accumulator = 1.0; + int gop_coding_frames; + + *use_alt_ref = 1; + gop_coding_frames = 0; + while (gop_coding_frames < rc->static_scene_max_gf_interval && + gop_coding_frames < rc->frames_to_key) { + const FIRSTPASS_STATS *next_next_frame; + const FIRSTPASS_STATS *next_frame; + int flash_detected; + ++gop_coding_frames; + + next_frame = fps_get_frame_stats(first_pass_info, + gf_start_show_idx + gop_coding_frames); + if (next_frame == NULL) { + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; + break; + } + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + next_next_frame = fps_get_frame_stats( + first_pass_info, gf_start_show_idx + gop_coding_frames + 1); + flash_detected = detect_flash_from_frame_stats(next_next_frame); + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // Monitor for static sections. + if ((rc->frames_since_key + gop_coding_frames - 1) > 1) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame)); + } + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + double last_loop_decay_rate = loop_decay_rate; + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + if (gop_coding_frames > rc->min_gf_interval && loop_decay_rate >= 0.999 && + last_loop_decay_rate < 0.9) { + int still_interval = 5; + if (check_transition_to_still(first_pass_info, + gf_start_show_idx + gop_coding_frames, + still_interval)) { + *use_alt_ref = 0; + break; + } + } + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (gop_coding_frames == 1) { + sr_accumulator += next_frame->coded_error; + } else { + sr_accumulator += + (next_frame->sr_coded_error - next_frame->coded_error); + } + } + + // Break out conditions. + // Break at maximum of active_gf_interval->max unless almost totally + // static. + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if ((gop_coding_frames >= active_gf_interval->max) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) { + break; + } + if ( + // Don't break out with a very short interval. + (gop_coding_frames >= active_gf_interval->min) && + // If possible don't break very close to a kf + ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && + (gop_coding_frames & 0x01) && (!flash_detected) && + ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || + (abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH) || + (sr_accumulator > gop_intra_factor * next_frame->intra_error))) { + break; + } + } + *use_alt_ref &= zero_motion_accumulator < 0.995; + *use_alt_ref &= gop_coding_frames < lag_in_frames; + *use_alt_ref &= gop_coding_frames >= rc->min_gf_interval; + return gop_coding_frames; +} + +static RANGE get_active_gf_inverval_range( + const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf, + int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) { + RANGE active_gf_interval; + int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality, + frame_info->bit_depth)); + int q_term = (gf_start_show_idx == 0) + ? int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(last_boosted_qindex, + frame_info->bit_depth) / + 6); + active_gf_interval.min = + rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); + active_gf_interval.min = + VPXMIN(active_gf_interval.min, rc->max_gf_interval + arf_active_or_kf); + + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_gf_interval.max = 11 + arf_active_or_kf + VPXMIN(5, q_term); + + // Force max GF interval to be odd. + active_gf_interval.max = active_gf_interval.max | 0x01; + + // We have: active_gf_interval.min <= + // rc->max_gf_interval + arf_active_or_kf. + if (active_gf_interval.max < active_gf_interval.min) { + active_gf_interval.max = active_gf_interval.min; + } else { + active_gf_interval.max = + VPXMIN(active_gf_interval.max, rc->max_gf_interval + arf_active_or_kf); + } + + // Would the active max drop us out just before the near the next kf? + if ((active_gf_interval.max <= rc->frames_to_key) && + (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) { + active_gf_interval.max = rc->frames_to_key / 2; + } + active_gf_interval.max = + VPXMAX(active_gf_interval.max, active_gf_interval.min); + return active_gf_interval; +} + +static int get_arf_layers(int multi_layer_arf, int max_layers, + int coding_frame_num) { + assert(max_layers <= MAX_ARF_LAYERS); + if (multi_layer_arf) { + int layers = 0; + int i; + for (i = coding_frame_num; i > 0; i >>= 1) { + ++layers; + } + layers = VPXMIN(max_layers, layers); + return layers; + } else { + return 1; + } +} + +static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; - FIRSTPASS_STATS next_frame; + const FRAME_INFO *frame_info = &cpi->frame_info; + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - int i; + int gop_coding_frames; - double boost_score = 0.0; - double old_boost_score = 0.0; double gf_group_err = 0.0; double gf_group_raw_error = 0.0; double gf_group_noise = 0.0; @@ -2109,268 +2775,263 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double gf_group_inactive_zone_rows = 0.0; double gf_group_inter = 0.0; double gf_group_motion = 0.0; - double gf_first_frame_err = 0.0; - double mod_frame_err = 0.0; - double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; - double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; - double last_loop_decay_rate = 1.00; + int allow_alt_ref = is_altref_enabled(cpi); + int use_alt_ref; - double this_frame_mv_in_out = 0.0; - double mv_in_out_accumulator = 0.0; - double abs_mv_in_out_accumulator = 0.0; - double mv_ratio_accumulator_thresh; - double mv_in_out_thresh; - double abs_mv_in_out_thresh; - double sr_accumulator = 0.0; - unsigned int allow_alt_ref = is_altref_enabled(cpi); - - int f_boost = 0; - int b_boost = 0; - int flash_detected; - int active_max_gf_interval; - int active_min_gf_interval; int64_t gf_group_bits; int gf_arf_bits; - const int is_key_frame = frame_is_intra_only(cm); + int is_key_frame = frame_is_intra_only(cm); + + vpx_rc_gop_decision_t gop_decision; + int gop_decision_ready = 0; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + vpx_codec_err_t codec_status = + vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_gop_decision() failed"); + } + is_key_frame = gop_decision.use_key_frame; + gop_decision_ready = 1; + } + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int is_alt_ref_flash = 0; + + double gop_intra_factor; + int gop_frames; + RANGE active_gf_interval; + // Whether this is at the end of last GOP of this sequence. + int end_of_sequence = 0; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. if (is_key_frame == 0) { vp9_zero(twopass->gf_group); + ++rc->gop_global_index; + } else { + rc->gop_global_index = 0; } vpx_clear_system_state(); - vp9_zero(next_frame); - // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + active_gf_interval = get_active_gf_inverval_range( + frame_info, rc, arf_active_or_kf, gf_start_show_idx, + twopass->active_worst_quality, rc->last_boosted_qindex); - // Note the error of the frame at the start of the group. This will be - // the GF frame error if we code a normal gf. - gf_first_frame_err = mod_frame_err; - - // If this is a key frame or the overlay from a previous arf then - // the error score / cost of this frame has already been accounted for. - if (arf_active_or_kf) { - gf_group_err -= gf_first_frame_err; - gf_group_raw_error -= this_frame->coded_error; - gf_group_noise -= this_frame->frame_noise_energy; - gf_group_skip_pct -= this_frame->intra_skip_pct; - gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; - gf_group_inter -= this_frame->pcnt_inter; - gf_group_motion -= this_frame->pcnt_motion; + if (cpi->multi_layer_arf) { + int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf, + active_gf_interval.max); + gop_intra_factor = 1.0 + 0.25 * arf_layers; + } else { + gop_intra_factor = 1.0; } - // Motion breakout threshold for loop below depends on image size. - mv_ratio_accumulator_thresh = - (cpi->initial_height + cpi->initial_width) / 4.0; - mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0; - abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0; + gop_coding_frames = get_gop_coding_frame_num( + &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); + use_alt_ref &= allow_alt_ref; - // Set a maximum and minimum interval for the GF group. - // If the image appears almost completely static we can extend beyond this. - { - int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, - cpi->common.bit_depth)); - int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); - active_min_gf_interval = - rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); - if (active_min_gf_interval > rc->max_gf_interval) - active_min_gf_interval = rc->max_gf_interval; - - if (cpi->multi_arf_allowed) { - active_max_gf_interval = rc->max_gf_interval; - } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) - active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; - - // Would the active max drop us out just before the near the next kf? - if ((active_max_gf_interval <= rc->frames_to_key) && - (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) - active_max_gf_interval = rc->frames_to_key / 2; - } - } - - i = 0; - while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { - ++i; - - // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); - gf_group_err += mod_frame_err; - gf_group_raw_error += this_frame->coded_error; - gf_group_noise += this_frame->frame_noise_energy; - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - gf_group_inter += this_frame->pcnt_inter; - gf_group_motion += this_frame->pcnt_motion; - - if (EOF == input_stats(twopass, &next_frame)) break; - - // Test for the case where there is a brief flash but the prediction - // quality back to an earlier frame is then restored. - flash_detected = detect_flash(twopass, 0); - - // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats( - &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - - // Accumulate the effect of prediction quality decay. - if (!flash_detected) { - last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - - decay_accumulator = decay_accumulator * loop_decay_rate; - - // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - - // Break clause to detect very still sections after motion. For example, - // a static image after a fade or other transition. - if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, - last_loop_decay_rate)) { - allow_alt_ref = 0; - break; - } - } - - // Calculate a boost number for this frame. - sr_accumulator = 0.0; - boost_score += decay_accumulator * - calc_frame_boost(cpi, &next_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); - - // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. - ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || - ( - // Don't break out with a very short interval. - (i >= active_min_gf_interval) && - // If possible dont break very close to a kf - ((rc->frames_to_key - i) >= rc->min_gf_interval) && - (!flash_detected) && - ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || - (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (mv_in_out_accumulator < -mv_in_out_thresh) || - ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { - boost_score = old_boost_score; - break; - } - - *this_frame = next_frame; - old_boost_score = boost_score; + if (gop_decision_ready) { + gop_coding_frames = gop_decision.gop_coding_frames; + use_alt_ref = gop_decision.use_alt_ref; } // Was the group length constrained by the requirement for a new KF? - rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if (use_alt_ref) { + const int f_frames = + (rc->frames_to_key - gop_coding_frames >= gop_coding_frames - 1) + ? gop_coding_frames - 1 + : VPXMAX(0, rc->frames_to_key - gop_coding_frames); + const int b_frames = gop_coding_frames - 1; + const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME]; + // TODO(angiebird): figure out why arf's location is assigned this way + const int arf_show_idx = VPXMIN(gf_start_show_idx + gop_coding_frames + 1, + fps_get_num_frames(first_pass_info)); + // Calculate the boost for alt ref. rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 1 - : 0; } else { - rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST); + const int f_frames = gop_coding_frames - 1; + const int b_frames = 0; + const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME]; + // TODO(angiebird): figure out why arf's location is assigned this way + const int gld_show_idx = + VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info)); + const int arf_boost = + compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames, + avg_inter_frame_qindex); + rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost); rc->source_alt_ref_pending = 0; } - // Limit maximum boost based on interval length. - rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); +#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2 + rc->arf_active_best_quality_adjustment_factor = 1.0; + rc->arf_increase_active_best_quality = 0; - // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); - - // Only encode alt reference frame in temporal base layer. So - // baseline_gf_interval should be multiple of a temporal layer group - // (typically the frame distance between two base layer frames) - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); - int j; - for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); - gf_group_raw_error += this_frame->coded_error; - gf_group_noise += this_frame->frame_noise_energy; - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - gf_group_inter += this_frame->pcnt_inter; - gf_group_motion += this_frame->pcnt_motion; + if (!is_lossless_requested(&cpi->oxcf)) { + if (rc->frames_since_key >= rc->frames_to_key) { + // Increase the active best quality in the second half of key frame + // interval. + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_to_key - gop_coding_frames) / + (VPXMAX(1, ((rc->frames_to_key + rc->frames_since_key) / 2 - + gop_coding_frames))); + rc->arf_increase_active_best_quality = 1; + } else if ((rc->frames_to_key - gop_coding_frames) > 0) { + // Reduce the active best quality in the first half of key frame interval. + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_since_key + gop_coding_frames) / + (VPXMAX(1, (rc->frames_to_key + rc->frames_since_key) / 2 + + gop_coding_frames)); + rc->arf_increase_active_best_quality = -1; } - rc->baseline_gf_interval = new_gf_interval; } - rc->frames_till_gf_update_due = rc->baseline_gf_interval; +#ifdef AGGRESSIVE_VBR + // Limit maximum boost based on interval length. + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 140); +#else + rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 200); +#endif - // Reset the file position. - reset_fpf_position(twopass, start_pos); + // Cap the ARF boost when perceptual quality AQ mode is enabled. This is + // designed to improve the perceptual quality of high value content and to + // make consistent quality across consecutive frames. It will hurt objective + // quality. + if (oxcf->aq_mode == PERCEPTUAL_AQ) + rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST); + + rc->baseline_gf_interval = gop_coding_frames - rc->source_alt_ref_pending; + + if (rc->source_alt_ref_pending) + is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval); + + { + const double av_err = get_distribution_av_err(cpi, twopass); + const double mean_mod_score = twopass->mean_mod_score; + // If the first frame is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + int start_idx = arf_active_or_kf ? 1 : 0; + int j; + for (j = start_idx; j < gop_coding_frames; ++j) { + int show_idx = gf_start_show_idx + j; + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, show_idx); + if (frame_stats == NULL) { + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL) { + // Since in ext_ratectrl, gop_coding_frames means the count of both + // show and no show frames. Using this variable to access + // first_pass_info will trigger out-of-range error because + // first_pass_info only contains show frames. This part is used for + // computing gf_group_err which will be used to compute gf_group_bits + // for libvpx internal rate control. Since ext_ratectrl is using + // external rate control module, this part becomes non-critical. + // Hence, we can safely turn off this error reporting. + break; + } + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "In define_gf_group(), frame_stats is NULL when " + "calculating gf_group_err."); + break; + } + // Accumulate error score of frames in this gf group. + gf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats, + mean_mod_score, av_err); + gf_group_raw_error += frame_stats->coded_error; + gf_group_noise += frame_stats->frame_noise_energy; + gf_group_skip_pct += frame_stats->intra_skip_pct; + gf_group_inactive_zone_rows += frame_stats->inactive_zone_rows; + gf_group_inter += frame_stats->pcnt_inter; + gf_group_motion += frame_stats->pcnt_motion; + } + } // Calculate the bits to be allocated to the gf/arf group as a whole gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; + + // Store the average moise level measured for the group + // TODO(any): Experiment with removal of else condition (gop_frames = 0) so + // that consumption of group noise energy is based on previous gf group + if (gop_frames > 0) + twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames); + else + twopass->gf_group.group_noise_energy = 0; + // Calculate an estimate of the maxq needed for the group. - // We are more agressive about correcting for sections + // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget. if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) { - const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); - const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; - const double group_av_noise = gf_group_noise / rc->baseline_gf_interval; - const double group_av_skip_pct = - gf_group_skip_pct / rc->baseline_gf_interval; - const double group_av_inactive_zone = - ((gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mb_rows)); + const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames); + const double group_av_err = gf_group_raw_error / gop_frames; + const double group_av_noise = gf_group_noise / gop_frames; + const double group_av_skip_pct = gf_group_skip_pct / gop_frames; + const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / + (gop_frames * (double)cm->mb_rows)); int tmp_q = get_twopass_worst_quality( cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = - (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + (int)((tmp_q + (twopass->active_worst_quality * + (twopass->active_wq_factor - 1))) / + twopass->active_wq_factor); + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif } // Context Adjustment of ARNR filter strength if (rc->baseline_gf_interval > 1) { - adjust_group_arnr_filter(cpi, (gf_group_noise / rc->baseline_gf_interval), - (gf_group_inter / rc->baseline_gf_interval), - (gf_group_motion / rc->baseline_gf_interval)); + adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames), + (gf_group_inter / gop_frames), + (gf_group_motion / gop_frames)); } else { twopass->arnr_strength_adjustment = 0; } // Calculate the extra bits to be used for boosted frame(s) - gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, - gf_group_bits); + gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1), + rc->gfu_boost, gf_group_bits); // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; + twopass->kf_group_error_left -= gf_group_err; + + // Decide GOP structure. + if (gop_decision_ready) { + ext_rc_define_gf_group_structure(&gop_decision, &twopass->gf_group); + // Set the fb idx for the first frame in this GOP. + cpi->lst_fb_idx = twopass->gf_group.ext_rc_ref[0].last_index; + cpi->gld_fb_idx = twopass->gf_group.ext_rc_ref[0].golden_index; + cpi->alt_fb_idx = twopass->gf_group.ext_rc_ref[0].altref_index; + } else { + define_gf_group_structure(cpi); + } // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2379,32 +3040,94 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(twopass, start_pos); // Calculate a section intra ratio used in setting max loop filter. - if (cpi->common.frame_type != KEY_FRAME) { - twopass->section_intra_rating = calculate_section_intra_ratio( - start_pos, twopass->stats_in_end, rc->baseline_gf_interval); - } + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } - +#if !CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; +#endif + rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld; + rc->preserve_next_arf_as_gld = 0; + // If alt ref frame is flash do not set preserve_arf_as_gld + if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc && + cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash) + rc->preserve_next_arf_as_gld = 1; +} + +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// Clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection. +// Tests for case where there is very low error either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + +// This test looks for anomalous changes in the nature of the intra signal +// related to the previous and next frame as an indicator for coding a key +// frame. This test serves to detect some additional scene cuts, +// especially in lowish motion and low contrast sections, that are missed +// by the other tests. +static int intra_step_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + double last_ii_ratio; + double this_ii_ratio; + double next_ii_ratio; + double last_pcnt_intra = 1.0 - last_frame->pcnt_inter; + double this_pcnt_intra = 1.0 - this_frame->pcnt_inter; + double next_pcnt_intra = 1.0 - next_frame->pcnt_inter; + double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral; + + // Calculate ii ratio for this frame last frame and next frame. + last_ii_ratio = + last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error); + this_ii_ratio = + this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + next_ii_ratio = + next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); + + // Return true the intra/inter ratio for the current frame is + // low but better in the next and previous frame and the relative usage of + // intra in the current frame is markedly higher than the last and next frame. + if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && + (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && + (this_pcnt_intra > (3 * next_pcnt_intra)) && + ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) { + return 1; + // Very low inter intra ratio (i.e. not much gain from inter coding), most + // blocks neutral on coding method and better inter prediction either side + } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) && + (this_ii_ratio < last_ii_ratio * 0.9) && + (this_ii_ratio < next_ii_ratio * 0.9)) { + return 1; + } else { + return 0; + } } -// Threshold for use of the lagging second reference frame. High second ref -// usage may point to a transient event like a flash or occlusion rather than -// a real scene cut. -#define SECOND_REF_USEAGE_THRESH 0.1 // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 -// Minimum ratio between the % of intra coding and inter coding in the first -// pass after discounting neutral blocks (discounting neutral blocks in this -// way helps catch scene cuts in clips with very flat areas or letter box -// format clips with image padding. -#define INTRA_VS_INTER_THRESH 2.0 +// Threshold for use of the lagging second reference frame. Scene cuts do not +// usually have a high second ref usage. +#define SECOND_REF_USAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -2412,80 +3135,75 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Maximum threshold for the relative ratio of intra error score vs best // inter error score. #define KF_II_ERR_THRESHOLD 2.5 -// In real scene cuts there is almost always a sharp change in the intra -// or inter error score. -#define ERR_CHANGE_THRESHOLD 0.4 -// For real scene cuts we expect an improvment in the intra inter error -// ratio in the next frame. -#define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 +#define II_FACTOR 12.5 +// Test for very low intra complexity which could cause false key frames +#define V_LOW_INTRA 0.5 -static int test_candidate_kf(TWO_PASS *twopass, - const FIRSTPASS_STATS *last_frame, - const FIRSTPASS_STATS *this_frame, - const FIRSTPASS_STATS *next_frame) { +static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, + int show_idx) { + const FIRSTPASS_STATS *last_frame = + fps_get_frame_stats(first_pass_info, show_idx - 1); + const FIRSTPASS_STATS *this_frame = + fps_get_frame_stats(first_pass_info, show_idx); + const FIRSTPASS_STATS *next_frame = + fps_get_frame_stats(first_pass_info, show_idx + 1); int is_viable_kf = 0; double pcnt_intra = 1.0 - this_frame->pcnt_inter; - double modified_pcnt_inter = - this_frame->pcnt_inter - this_frame->pcnt_neutral; // Does the frame satisfy the primary criteria of a key frame? // See above for an explanation of the test criteria. // If so, then examine how well it predicts subsequent frames. - if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + detect_flash_from_frame_stats(next_frame); + if (!detect_flash_from_frame_stats(this_frame) && + !detect_flash_from_frame_stats(next_frame) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - ((pcnt_intra > MIN_INTRA_LEVEL) && - (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + (slide_transition(this_frame, last_frame, next_frame)) || + (intra_step_transition(this_frame, last_frame, next_frame)) || + (((this_frame->coded_error > (next_frame->coded_error * 1.2)) && + (this_frame->coded_error > (last_frame->coded_error * 1.2))) && + (pcnt_intra > MIN_INTRA_LEVEL) && + ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < - KF_II_ERR_THRESHOLD) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - ERR_CHANGE_THRESHOLD) || - (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - ERR_CHANGE_THRESHOLD) || - ((next_frame->intra_error / - DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > - II_IMPROVEMENT_THRESHOLD))))) { + KF_II_ERR_THRESHOLD)))) { int i; - const FIRSTPASS_STATS *start_pos = twopass->stats_in; - FIRSTPASS_STATS local_next_frame = *next_frame; double boost_score = 0.0; double old_boost_score = 0.0; double decay_accumulator = 1.0; // Examine how well the key frame predicts subsequent frames. for (i = 0; i < 16; ++i) { - double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, show_idx + 1 + i); + double next_iiratio = (II_FACTOR * frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(frame_stats->coded_error)); if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; // Cumulative effect of decay in prediction quality. - if (local_next_frame.pcnt_inter > 0.85) - decay_accumulator *= local_next_frame.pcnt_inter; + if (frame_stats->pcnt_inter > 0.85) + decay_accumulator *= frame_stats->pcnt_inter; else - decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; + decay_accumulator *= (0.85 + frame_stats->pcnt_inter) / 2.0; // Keep a running total. boost_score += (decay_accumulator * next_iiratio); // Test various breakout clauses. - if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || - (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < - 0.20) && + if ((frame_stats->pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((frame_stats->pcnt_inter - frame_stats->pcnt_neutral) < 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200)) { + (frame_stats->intra_error < V_LOW_INTRA)) { break; } old_boost_score = boost_score; // Get the next frame details - if (EOF == input_stats(twopass, &local_next_frame)) break; + if (show_idx + 1 + i == fps_get_num_frames(first_pass_info) - 1) break; } // If there is tolerable prediction for at least the next 3 frames then @@ -2493,9 +3211,6 @@ static int test_candidate_kf(TWO_PASS *twopass, if (boost_score > 30.0 && (i > 3)) { is_viable_kf = 1; } else { - // Reset the file position - reset_fpf_position(twopass, start_pos); - is_viable_kf = 0; } } @@ -2504,33 +3219,105 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 -#define KF_MAX_FRAME_BOOST 96.0 #define MIN_KF_TOT_BOOST 300 -#define MAX_KF_TOT_BOOST 5400 -#define KF_BOOST_SCAN_MAX_FRAMES 32 +#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32 +#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48 +#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 +#define KF_ABS_ZOOM_THRESH 6.0 -static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - int i, j; +int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval) { + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + int j; + int frames_to_key; + int max_frames_to_key = first_pass_info->num_frames - kf_show_idx; + max_frames_to_key = VPXMIN(max_frames_to_key, oxcf->key_freq); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + // Find the next keyframe. + if (!oxcf->auto_key) { + frames_to_key = max_frames_to_key; + } else { + frames_to_key = 1; + while (frames_to_key < max_frames_to_key) { + // Provided that we are not at the end of the file... + if (kf_show_idx + frames_to_key + 1 < first_pass_info->num_frames) { + double loop_decay_rate; + double decay_accumulator; + const FIRSTPASS_STATS *next_frame = fps_get_frame_stats( + first_pass_info, kf_show_idx + frames_to_key + 1); + + // Check for a scene cut. + if (test_candidate_kf(first_pass_info, kf_show_idx + frames_to_key)) + break; + + // How fast is the prediction quality decaying? + loop_decay_rate = get_prediction_decay_rate(twopass, next_frame); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[(frames_to_key - 1) % FRAMES_TO_CHECK_DECAY] = + loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if ((frames_to_key - 1) > min_gf_interval && loop_decay_rate >= 0.999 && + decay_accumulator < 0.9) { + int still_interval = oxcf->key_freq - (frames_to_key - 1); + // TODO(angiebird): Figure out why we use "+1" here + int show_idx = kf_show_idx + frames_to_key; + if (check_transition_to_still(first_pass_info, show_idx, + still_interval)) { + break; + } + } + } + ++frames_to_key; + } + } + return frames_to_key; +} + +static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { + int i; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const FIRSTPASS_STATS first_frame = *this_frame; + const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; + const FRAME_INFO *frame_info = &cpi->frame_info; const FIRSTPASS_STATS *const start_position = twopass->stats_in; + const FIRSTPASS_STATS *keyframe_stats = + fps_get_frame_stats(first_pass_info, kf_show_idx); FIRSTPASS_STATS next_frame; - FIRSTPASS_STATS last_frame; int kf_bits = 0; - double decay_accumulator = 1.0; + int64_t max_kf_bits; double zero_motion_accumulator = 1.0; + double zero_motion_sum = 0.0; + double zero_motion_avg; + double motion_compensable_sum = 0.0; + double motion_compensable_avg; + int num_frames = 0; + int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST; double boost_score = 0.0; double kf_mod_err = 0.0; + double kf_raw_err = 0.0; double kf_group_err = 0.0; - double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; - + double abs_mv_in_out_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); + const double mean_mod_score = twopass->mean_mod_score; vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2541,116 +3328,56 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; rc->frames_to_key = 1; - twopass->kf_group_bits = 0; // Total bits available to kf group - twopass->kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_raw_err = keyframe_stats->intra_error; + kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats, + mean_mod_score, av_err); - // Initialize the decay rates for the recent frames to check - for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; - - // Find the next keyframe. - i = 0; - while (twopass->stats_in < twopass->stats_in_end && - rc->frames_to_key < cpi->oxcf.key_freq) { - // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); - - // Load the next frame's stats. - last_frame = *this_frame; - input_stats(twopass, this_frame); - - // Provided that we are not at the end of the file... - if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { - double loop_decay_rate; - - // Check for a scene cut. - if (test_candidate_kf(twopass, &last_frame, this_frame, - twopass->stats_in)) - break; - - // How fast is the prediction quality decaying? - loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in); - - // We want to know something about the recent past... rather than - // as used elsewhere where we are concerned with decay in prediction - // quality since the last GF or KF. - recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; - decay_accumulator = 1.0; - for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) - decay_accumulator *= recent_loop_decay[j]; - - // Special check for transition or high motion followed by a - // static scene. - if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i, - loop_decay_rate, decay_accumulator)) - break; - - // Step on to the next frame. - ++rc->frames_to_key; - - // If we don't have a real key frame within the next two - // key_freq intervals then break out of the loop. - if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_key_frame_decision != NULL) { + vpx_rc_key_frame_decision_t key_frame_decision; + vpx_codec_err_t codec_status = vp9_extrc_get_key_frame_decision( + &cpi->ext_ratectrl, &key_frame_decision); + if (codec_status == VPX_CODEC_OK) { + rc->frames_to_key = key_frame_decision.key_frame_group_size; } else { - ++rc->frames_to_key; + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_get_key_frame_decision() failed"); } - ++i; + } else { + rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx, + rc->min_gf_interval); } // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. // This code centers the extra kf if the actual natural interval // is between 1x and 2x. - if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) { - FIRSTPASS_STATS tmp_frame = first_frame; - - rc->frames_to_key /= 2; - - // Reset to the start of the group. - reset_fpf_position(twopass, start_position); - - kf_group_err = 0.0; - - // Rescan to get the correct error data for the forced kf group. - for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); - input_stats(twopass, &tmp_frame); - } - rc->next_key_frame_forced = 1; - } else if (twopass->stats_in == twopass->stats_in_end || - rc->frames_to_key >= cpi->oxcf.key_freq) { + if (rc->frames_to_key >= cpi->oxcf.key_freq) { rc->next_key_frame_forced = 1; } else { rc->next_key_frame_forced = 0; } - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_frame_to_key = (rc->frames_to_key + count) & (~count); - int j; - for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); - } - rc->frames_to_key = new_frame_to_key; - } - - // Special case for the last key frame of the file. - if (twopass->stats_in >= twopass->stats_in_end) { + for (i = 0; i < rc->frames_to_key; ++i) { + const FIRSTPASS_STATS *frame_stats = + fps_get_frame_stats(first_pass_info, kf_show_idx + i); // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + kf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats, + mean_mod_score, av_err); } // Calculate the number of bits that should be assigned to the kf group. - if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) { // Maximum number of bits for a single normal frame (not key frame). const int max_bits = frame_max_bits(rc, &cpi->oxcf); @@ -2659,8 +3386,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default allocation based on bits left and relative // complexity of the section. - twopass->kf_group_bits = (int64_t)( - twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + twopass->kf_group_bits = + (int64_t)(twopass->bits_left * + (kf_group_err / twopass->normalized_score_left)); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; @@ -2671,37 +3399,73 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits); - // Reset the first pass file position. - reset_fpf_position(twopass, start_position); - // Scan through the kf group collating various stats used to determine // how many bits to spend on it. boost_score = 0.0; + for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1)); + ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion; + motion_compensable_sum += + 1 - (double)next_frame.coded_error / next_frame.intra_error; + num_frames++; + } + + if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) { + zero_motion_avg = zero_motion_sum / num_frames; + motion_compensable_avg = motion_compensable_sum / num_frames; + kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16, + 160 * motion_compensable_avg - 112)); + kf_boost_scan_frames = + clamp(kf_boost_scan_frames, MIN_SCAN_FRAMES_FOR_KF_BOOST, + MAX_SCAN_FRAMES_FOR_KF_BOOST); + } + reset_fpf_position(twopass, start_position); + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here insures that if we mark a kf group as static + // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // First frame in kf group the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = + VPXMIN(zero_motion_accumulator, + get_zero_motion_factor(twopass, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); // The second (lagging) ref error is not valid immediately after // a key frame because either the lag has not built up (in the case of - // the first key frame or it points to a refernce before the new key + // the first key frame or it points to a reference before the new key // frame. if (i < 2) sr_accumulator = 0.0; - frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, - KF_MAX_FRAME_BOOST * zm_factor); + frame_boost = + calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor); boost_score += frame_boost; - if (frame_boost < 25.00) break; + + // Measure of zoom. Large zoom tends to indicate reduced boost. + abs_mv_in_out_accumulator += + fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + if ((frame_boost < 25.00) || + (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) || + (sr_accumulator > (kf_raw_err * 1.50))) + break; } else { break; } @@ -2713,17 +3477,30 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); // Calculate a section intra ratio used in setting max loop filter. - twopass->section_intra_rating = calculate_section_intra_ratio( + twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = (int)(twopass->kf_max_total_boost); + } else { + // Apply various clamps for min and max oost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost)); + } // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); + // Based on the spatial complexity, increase the bits allocated to key frame. + kf_bits += + (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err)); + max_kf_bits = + twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS; + max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX); + kf_bits = VPXMIN(kf_bits, (int)max_kf_bits); twopass->kf_group_bits -= kf_bits; @@ -2731,14 +3508,15 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { gf_group->bit_allocation[0] = kf_bits; gf_group->update_type[0] = KF_UPDATE; gf_group->rf_level[0] = KF_STD; + gf_group->layer_depth[0] = 0; // Note the total error score of the kf group minus the key frame itself. - twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + twopass->kf_group_error_left = (kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. - twopass->modified_error_left -= kf_group_err; + twopass->normalized_score_left -= kf_group_err; if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to normal-sized frame on keyframes. @@ -2746,114 +3524,117 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } -// Define the reference buffers that will be updated post encode. -static void configure_buffer_updates(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; +// Configure image size specific vizier parameters. +// Later these will be set via additional command line options +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) { + // When |use_vizier_rc_params| is 1, we expect the rc parameters below to + // have been initialised on the command line as adjustment factors such + // that a factor of 1.0 will match the default behavior when + // |use_vizier_rc_params| is 0 + if (twopass->use_vizier_rc_params) { + twopass->active_wq_factor *= AV_WQ_FACTOR; + twopass->err_per_mb *= BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT; + if (twopass->sr_default_decay_limit > 1.0) // > 1.0 here makes no sense + twopass->sr_default_decay_limit = 1.0; + twopass->sr_diff_factor *= 1.0; + twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost *= MAX_GF_BOOST; + // NOTE: In use max boost has precedence over min boost. So even if min is + // somehow set higher than max the final boost value will be clamped to the + // appropriate maximum. + twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST; + twopass->zm_factor *= DEFAULT_ZM_FACTOR; + if (twopass->zm_factor > 1.0) // > 1.0 here makes no sense + twopass->zm_factor = 1.0; - cpi->rc.is_src_frame_alt_ref = 0; - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - break; - case LF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - case GF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - case OVERLAY_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - cpi->rc.is_src_frame_alt_ref = 1; - break; - case ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - default: assert(0); break; - } - if (is_two_pass_svc(cpi)) { - if (cpi->svc.temporal_layer_id > 0) { - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; + // Correction for the fact that the kf_err_per_mb_factor default is + // already different for different video formats and ensures that a passed + // in value of 1.0 on the vizier command line will still match the current + // default. + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb *= 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb *= 500.0; + } else { + twopass->kf_err_per_mb *= 250.0; + } + } else { + // When |use_vizier_rc_params| is 0, use defaults. + twopass->active_wq_factor = AV_WQ_FACTOR; + twopass->err_per_mb = BASELINE_ERR_PER_MB; + twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT; + twopass->sr_diff_factor = 1.0; + twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST; + twopass->gf_max_total_boost = MAX_GF_BOOST; + twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST; + twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST; + twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST; + twopass->kf_max_total_boost = MAX_KF_TOT_BOOST; + twopass->zm_factor = DEFAULT_ZM_FACTOR; + + if (screen_area < 1280 * 720) { + twopass->kf_err_per_mb = 2000.0; + } else if (screen_area < 1920 * 1080) { + twopass->kf_err_per_mb = 500.0; + } else { + twopass->kf_err_per_mb = 250.0; } - if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) - cpi->refresh_golden_frame = 0; - if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; } } -static int is_skippable_frame(const VP9_COMP *cpi) { - // If the current frame does not have non-zero motion vector detected in the - // first pass, and so do its previous and forward frames, then this frame - // can be skipped for partition check, and the partition size is assigned - // according to the variance - const SVC *const svc = &cpi->svc; - const TWO_PASS *const twopass = - is_two_pass_svc(cpi) ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; - - return (!frame_is_intra_only(&cpi->common) && - twopass->stats_in - 2 > twopass->stats_in_start && - twopass->stats_in < twopass->stats_in_end && - (twopass->stats_in - 1)->pcnt_inter - - (twopass->stats_in - 1)->pcnt_motion == - 1 && - (twopass->stats_in - 2)->pcnt_inter - - (twopass->stats_in - 2)->pcnt_motion == - 1 && - twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); -} - void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS this_frame; + const int show_idx = cm->current_video_frame; - int target_rate; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : 0; + if (cpi->common.current_frame_coding_index == 0 && + cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { + const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( + &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_send_firstpass_stats() failed"); + } + } if (!twopass->stats_in) return; - // If this is an arf frame then we dont want to read the stats file or + // Configure image size specific vizier parameters + if (cm->current_video_frame == 0) { + unsigned int screen_area = (cm->width * cm->height); + + vp9_init_vizier_params(twopass, screen_area); + } + + // If this is an arf frame then we don't want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; - configure_buffer_updates(cpi); + + vp9_zero(this_frame); + this_frame = + cpi->twopass.stats_in_start[cm->current_video_frame + + gf_group->arf_src_offset[gf_group->index]]; + + vp9_configure_buffer_updates(cpi, gf_group->index); + target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; cm->frame_type = INTER_FRAME; - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - } - - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; return; } @@ -2862,15 +3643,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; - } else if (cm->current_video_frame == 0 || - (lc != NULL && lc->current_video_frame_in_layer == 0)) { + } else if (cm->current_video_frame == 0) { const int frames_left = - (int)(twopass->total_stats.count - - ((lc != NULL) ? lc->current_video_frame_in_layer - : cm->current_video_frame)); + (int)(twopass->total_stats.count - cm->current_video_frame); // Special case code for first frame. - const int section_target_bandwidth = - (int)(twopass->bits_left / frames_left); + int64_t section_target_bandwidth = twopass->bits_left / frames_left; + section_target_bandwidth = VPXMIN(section_target_bandwidth, INT_MAX); const double section_length = twopass->total_left_stats.count; const double section_error = twopass->total_left_stats.coded_error / section_length; @@ -2885,7 +3663,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { tmp_q = get_twopass_worst_quality( cpi, section_error, section_intra_skip + section_inactive_zone, - section_noise, section_target_bandwidth); + section_noise, (int)section_target_bandwidth); twopass->active_worst_quality = tmp_q; twopass->baseline_active_worst_quality = tmp_q; @@ -2907,87 +3685,51 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { // Keyframe and section processing. if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { - FIRSTPASS_STATS this_frame_copy; - this_frame_copy = this_frame; // Define next KF group and assign bits to it. - find_next_key_frame(cpi, &this_frame); - this_frame = this_frame_copy; + find_next_key_frame(cpi, show_idx); } else { cm->frame_type = INTER_FRAME; } - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = (cm->frame_type == KEY_FRAME); - if (lc->is_key_frame) { - cpi->ref_frame_flags &= - (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - lc->frames_from_key_frame = 0; - // Encode an intra only empty frame since we have a key frame. - cpi->svc.encode_intra_empty_frame = 1; - } - } else { - cm->frame_type = INTER_FRAME; - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) { - cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - lc->frames_from_key_frame = 0; - } - } - } - // Define a new GF/ARF group. (Should always enter here for key frames). if (rc->frames_till_gf_update_due == 0) { - define_gf_group(cpi, &this_frame); - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (lc != NULL) cpi->refresh_golden_frame = 1; + define_gf_group(cpi, show_idx); #if ARF_STATS_OUTPUT { FILE *fpfile; fpfile = fopen("arf.stt", "a"); ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->baseline_gf_interval, rc->kf_boost, + arf_count, rc->gfu_boost, cm->frame_type); fclose(fpfile); } #endif } - configure_buffer_updates(cpi); - - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + if (rc->frames_till_gf_update_due == 0) { + if (cpi->ext_ratectrl.ready && cpi->ext_ratectrl.log_file) { + fprintf(cpi->ext_ratectrl.log_file, "GOP_INFO show_frame_count %d\n", + rc->baseline_gf_interval); + } + rc->frames_till_gf_update_due = rc->baseline_gf_interval; } - target_rate = gf_group->bit_allocation[gf_group->index]; - rc->base_frame_target = target_rate; + vp9_configure_buffer_updates(cpi, gf_group->index); - { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = - log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0); - twopass->mb_smooth_pct = this_frame.intra_smooth_pct; - } + rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; + + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; // Update the total stats remaining structure. subtract_stats(&twopass->total_left_stats, &this_frame); } -#define MINQ_ADJ_LIMIT 48 -#define MINQ_ADJ_LIMIT_CQ 20 -#define HIGH_UNDERSHOOT_RATIO 2 void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; @@ -3015,8 +3757,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->rate_error_estimate = 0; } - if (cpi->common.frame_type != KEY_FRAME && - !vp9_is_upper_layer_key_frame(cpi)) { + if (cpi->common.frame_type != KEY_FRAME) { twopass->kf_group_bits -= bits_used; twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; } @@ -3036,7 +3777,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { // Extend min or Max Q range to account for imbalance from the base // value when using AQ. - if (cpi->oxcf.aq_mode != NO_AQ) { + if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ && + cpi->oxcf.aq_mode != PERCEPTUAL_AQ) { if (cm->seg.aq_av_offset < 0) { // The balance of the AQ map tends towarda lowering the average Q. aq_extend_min = 0; @@ -3086,7 +3828,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->vbr_bits_off_target_fast += fast_extra_thresh - rc->projected_frame_size; rc->vbr_bits_off_target_fast = - VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); + VPXMIN(rc->vbr_bits_off_target_fast, + (4 * (int64_t)rc->avg_frame_bandwidth)); // Fast adaptation of minQ if necessary to use up the extra bits. if (rc->avg_frame_bandwidth) { @@ -3104,3 +3847,10 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { } } } + +FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass) { + return twopass->this_frame_stats; +} +FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass) { + return twopass->total_stats; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h index 5541893dc8..9fdd5fcb1d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h @@ -8,9 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ -#define VP9_ENCODER_VP9_FIRSTPASS_H_ +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ +#include + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -18,53 +22,42 @@ extern "C" { #endif -#if CONFIG_FP_MB_STATS +#define INVALID_ROW (-1) -#define FPMB_DCINTRA_MASK 0x01 - -#define FPMB_MOTION_ZERO_MASK 0x02 -#define FPMB_MOTION_LEFT_MASK 0x04 -#define FPMB_MOTION_RIGHT_MASK 0x08 -#define FPMB_MOTION_UP_MASK 0x10 -#define FPMB_MOTION_DOWN_MASK 0x20 - -#define FPMB_ERROR_SMALL_MASK 0x40 -#define FPMB_ERROR_LARGE_MASK 0x80 -#define FPMB_ERROR_SMALL_TH 2000 -#define FPMB_ERROR_LARGE_TH 48000 +#define MAX_ARF_LAYERS 6 +#define SECTION_NOISE_DEF 250.0 typedef struct { - uint8_t *mb_stats_start; - uint8_t *mb_stats_end; -} FIRSTPASS_MB_STATS; -#endif + double frame_mb_intra_factor; + double frame_mb_brightness_factor; + double frame_mb_neutral_count; +} FP_MB_FLOAT_STATS; typedef struct { - double frame; - double weight; - double intra_error; - double coded_error; - double sr_coded_error; - double frame_noise_energy; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double intra_skip_pct; - double intra_smooth_pct; // % of blocks that are smooth - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double duration; - double count; - int64_t spatial_layer_id; -} FIRSTPASS_STATS; + double intra_factor; + double brightness_factor; + int64_t coded_error; + int64_t sr_coded_error; + int64_t frame_noise_energy; + int64_t intra_error; + int intercount; + int second_ref_count; + double neutral_count; + double intra_count_low; // Coded intra but low variance + double intra_count_high; // Coded intra high variance + int intra_skip_count; + int image_data_start_row; + int mvcount; + int sum_mvr; + int sum_mvr_abs; + int sum_mvc; + int sum_mvc_abs; + int64_t sum_mvrs; + int64_t sum_mvcs; + int sum_in_vectors; + int intra_smooth_count; + int new_mv_count; +} FIRSTPASS_DATA; typedef enum { KF_UPDATE = 0, @@ -72,7 +65,9 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, - FRAME_UPDATE_TYPES = 5 + MID_OVERLAY_UPDATE = 5, + USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update + FRAME_UPDATE_TYPES = 7 } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -82,38 +77,83 @@ typedef enum { FRAME_CONTENT_TYPES = 2 } FRAME_CONTENT_TYPE; +typedef struct ExternalRcReference { + int last_index; + int golden_index; + int altref_index; +} ExternalRcReference; + typedef struct { unsigned char index; - unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; - FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; - int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 2]; + + int frame_start; + int frame_end; + // TODO(jingning): The array size of arf_stack could be reduced. + int arf_index_stack[MAX_LAG_BUFFERS * 2]; + int top_arf_idx; + int stack_size; + int gf_group_size; + int max_layer_depth; + int allowed_max_layer_depth; + int group_noise_energy; + + // Structure to store the reference information from external RC. + // Used to override reference frame decisions in libvpx. + ExternalRcReference ext_rc_ref[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH + 2][REFS_PER_FRAME]; } GF_GROUP; +typedef struct { + const FIRSTPASS_STATS *stats; + int num_frames; +} FIRST_PASS_INFO; + +static INLINE void fps_init_first_pass_info(FIRST_PASS_INFO *first_pass_info, + const FIRSTPASS_STATS *stats, + int num_frames) { + first_pass_info->stats = stats; + first_pass_info->num_frames = num_frames; +} + +static INLINE int fps_get_num_frames(const FIRST_PASS_INFO *first_pass_info) { + return first_pass_info->num_frames; +} + +static INLINE const FIRSTPASS_STATS *fps_get_frame_stats( + const FIRST_PASS_INFO *first_pass_info, int show_idx) { + if (show_idx < 0 || show_idx >= first_pass_info->num_frames) { + return NULL; + } + return &first_pass_info->stats[show_idx]; +} + typedef struct { unsigned int section_intra_rating; + unsigned int key_frame_section_intra_rating; FIRSTPASS_STATS total_stats; FIRSTPASS_STATS this_frame_stats; const FIRSTPASS_STATS *stats_in; const FIRSTPASS_STATS *stats_in_start; const FIRSTPASS_STATS *stats_in_end; + FIRST_PASS_INFO first_pass_info; FIRSTPASS_STATS total_left_stats; int first_pass_done; int64_t bits_left; - double modified_error_min; - double modified_error_max; - double modified_error_left; + double mean_mod_score; + double normalized_score_left; double mb_av_energy; double mb_smooth_pct; -#if CONFIG_FP_MB_STATS - uint8_t *frame_mb_stats_buf; - uint8_t *this_frame_mb_stats; - FIRSTPASS_MB_STATS firstpass_mb_stats; -#endif + FP_MB_FLOAT_STATS *fp_mb_float_stats; + // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; @@ -121,7 +161,7 @@ typedef struct { int64_t kf_group_bits; // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; + double kf_group_error_left; double bpm_factor; int rolling_arf_group_target_bits; @@ -136,20 +176,46 @@ typedef struct { int extend_maxq; int extend_minq_fast; int arnr_strength_adjustment; + int last_qindex_of_arf_layer[MAX_ARF_LAYERS]; GF_GROUP gf_group; + + // Vizeir project experimental two pass rate control parameters. + // When |use_vizier_rc_params| is 1, the following parameters will + // be overwritten by pass in values. Otherwise, they are initialized + // by default values. + int use_vizier_rc_params; + double active_wq_factor; + double err_per_mb; + double sr_default_decay_limit; + double sr_diff_factor; + double kf_err_per_mb; + double kf_frame_min_boost; + double kf_frame_max_boost_first; // Max for first kf in a chunk. + double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs. + double kf_max_total_boost; + double gf_max_total_boost; + double gf_frame_max_boost; + double zm_factor; } TWO_PASS; struct VP9_COMP; +struct ThreadData; +struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); -void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, + struct ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + struct TileDataEnc *tile_data, + MV *best_ref_mv, int mb_row); + void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); -void vp9_twopass_postencode_update(struct VP9_COMP *cpi); +void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -157,8 +223,16 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi); void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, int *scaled_frame_height); +struct VP9EncoderConfig; +int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf, + const TWO_PASS *const twopass, int kf_show_idx, + int min_gf_interval); + +FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass); +FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_FIRSTPASS_H_ +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 0000000000..01928e7816 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c new file mode 100644 index 0000000000..c74d523246 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vpx/vpx_codec.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, + src->v_buffer }; + const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; + uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer }; + const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride }; + const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; + int x, y, i; + +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) + // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O + + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This function is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } + } + } + } + + vpx_extend_frame_borders(dst); +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h new file mode 100644 index 0000000000..ad09c11198 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ + +typedef enum { + FIRST_PASS_JOB, + ENCODE_JOB, + ARNR_JOB, + NUM_JOB_TYPES, +} JOB_TYPE; + +// Encode job parameters +typedef struct { + int vert_unit_row_num; // Index of the vertical unit row + int tile_col_id; // tile col id within a tile + int tile_row_id; // tile col id within a tile +} JobNode; + +// Job queue element parameters +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Job information context of the module + JobNode job_info; +} JobQueue; + +// Job queue handle +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Counter to store the number of jobs picked up for processing + int num_jobs_acquired; +} JobQueueHandle; + +#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c index 392cd5d418..ba4fe3d3b7 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c @@ -9,6 +9,7 @@ */ #include #include +#include #include "./vpx_config.h" @@ -64,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, unsigned int i; ctx->max_sz = depth; ctx->buf = calloc(depth, sizeof(*ctx->buf)); + ctx->next_show_idx = 0; if (!ctx->buf) goto bail; for (i = 0; i < depth; i++) if (vpx_alloc_frame_buffer( @@ -80,20 +82,18 @@ bail: return NULL; } -#define USE_PARTIAL_COPY 0 +int vp9_lookahead_full(const struct lookahead_ctx *ctx) { + return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz; +} + +int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx) { + return ctx->next_show_idx; +} int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_VP9_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, vpx_enc_frame_flags_t flags) { struct lookahead_entry *buf; -#if USE_PARTIAL_COPY - int row, col, active_end; - int mb_rows = (src->y_height + 15) >> 4; - int mb_cols = (src->y_width + 15) >> 4; -#endif int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; @@ -101,8 +101,12 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int subsampling_x = src->subsampling_x; int subsampling_y = src->subsampling_y; int larger_dimensions, new_dimensions; +#if !CONFIG_VP9_HIGHBITDEPTH + (void)use_highbitdepth; + assert(use_highbitdepth == 0); +#endif - if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1; + if (vp9_lookahead_full(ctx)) return 1; ctx->sz++; buf = pop(ctx, &ctx->write_idx); @@ -110,80 +114,50 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != buf->img.uv_crop_height; - larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || - uv_width > buf->img.uv_width || - uv_height > buf->img.uv_height; + larger_dimensions = + width > buf->img.y_crop_width || height > buf->img.y_crop_height || + uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); -#if USE_PARTIAL_COPY - // TODO(jkoleszar): This is disabled for now, as - // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware. - - // Only do this partial copy if the following conditions are all met: - // 1. Lookahead queue has has size of 1. - // 2. Active map is provided. - // 3. This is not a key frame, golden nor altref frame. - if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) { - for (row = 0; row < mb_rows; ++row) { - col = 0; - - while (1) { - // Find the first active macroblock in this row. - for (; col < mb_cols; ++col) { - if (active_map[col]) break; - } - - // No more active macroblock in this row. - if (col == mb_cols) break; - - // Find the end of active region in this row. - active_end = col; - - for (; active_end < mb_cols; ++active_end) { - if (!active_map[active_end]) break; - } - - // Only copy this active region. - vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4, - 16, (active_end - col) << 4); - - // Start again from the end of this active region. - col = active_end; - } - - active_map += mb_cols; - } - } else { -#endif - if (larger_dimensions) { - YV12_BUFFER_CONFIG new_img; - memset(&new_img, 0, sizeof(new_img)); - if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, - subsampling_y, + if (larger_dimensions) { + YV12_BUFFER_CONFIG new_img; + memset(&new_img, 0, sizeof(new_img)); + if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x, + subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, + use_highbitdepth, #endif - VP9_ENC_BORDER_IN_PIXELS, 0)) - return 1; - vpx_free_frame_buffer(&buf->img); - buf->img = new_img; - } else if (new_dimensions) { - buf->img.y_crop_width = src->y_crop_width; - buf->img.y_crop_height = src->y_crop_height; - buf->img.uv_crop_width = src->uv_crop_width; - buf->img.uv_crop_height = src->uv_crop_height; - buf->img.subsampling_x = src->subsampling_x; - buf->img.subsampling_y = src->subsampling_y; - } - // Partial copy not implemented yet - vp9_copy_and_extend_frame(src, &buf->img); -#if USE_PARTIAL_COPY + VP9_ENC_BORDER_IN_PIXELS, 0)) + return 1; + vpx_free_frame_buffer(&buf->img); + buf->img = new_img; + } else if (new_dimensions) { + int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + buf->img.y_width = src->y_width; + buf->img.y_height = src->y_height; + buf->img.uv_width = src->uv_width; + buf->img.uv_height = src->uv_height; + buf->img.y_crop_width = src->y_crop_width; + buf->img.y_crop_height = src->y_crop_height; + buf->img.uv_crop_width = src->uv_crop_width; + buf->img.uv_crop_height = src->uv_crop_height; + buf->img.subsampling_x = src->subsampling_x; + buf->img.subsampling_y = src->subsampling_y; + // Here the new width (src->y_crop_width) is <= the previous width + // (since otherwise it would enter the "larger_dimensions" code), so + // it is safe here to update the stride. + // The stride setting is taken from vpx_alloc_frame_buffer(). + buf->img.y_stride = + ALIGN_POWER_OF_TWO((aligned_width + 2 * buf->img.border), 5); + buf->img.uv_stride = buf->img.y_stride >> subsampling_x; } -#endif + vp9_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start; buf->ts_end = ts_end; buf->flags = flags; + buf->show_idx = ctx->next_show_idx; + ++ctx->next_show_idx; return 0; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h index 88be0ffcd5..6ac6736673 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h @@ -8,17 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_ -#define VP9_ENCODER_VP9_LOOKAHEAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ +#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_encoder.h" #include "vpx/vpx_integer.h" -#if CONFIG_SPATIAL_SVC -#include "vpx/vp8cx.h" -#endif - #ifdef __cplusplus extern "C" { #endif @@ -29,6 +25,7 @@ struct lookahead_entry { YV12_BUFFER_CONFIG img; int64_t ts_start; int64_t ts_end; + int show_idx; /*The show_idx of this frame*/ vpx_enc_frame_flags_t flags; }; @@ -36,10 +33,12 @@ struct lookahead_entry { #define MAX_PRE_FRAMES 1 struct lookahead_ctx { - int max_sz; /* Absolute size of the queue */ - int sz; /* Number of buffers currently in the queue */ - int read_idx; /* Read index */ - int write_idx; /* Write index */ + int max_sz; /* Absolute size of the queue */ + int sz; /* Number of buffers currently in the queue */ + int read_idx; /* Read index */ + int write_idx; /* Write index */ + int next_show_idx; /* The show_idx that will be assigned to the next frame + being pushed in the queue*/ struct lookahead_entry *buf; /* Buffer list */ }; @@ -61,26 +60,36 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, */ void vp9_lookahead_destroy(struct lookahead_ctx *ctx); +/**\brief Check if lookahead is full + * + * \param[in] ctx Pointer to the lookahead context + * + * Return 1 if lookahead is full, otherwise return 0. + */ +int vp9_lookahead_full(const struct lookahead_ctx *ctx); + +/**\brief Return the next_show_idx + * + * \param[in] ctx Pointer to the lookahead context + * + * Return the show_idx that will be assigned to the next + * frame pushed by vp9_lookahead_push() + */ +int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx); + /**\brief Enqueue a source buffer * * This function will copy the source image into a new framebuffer with * the expected stride/border. * - * If active_map is non-NULL and there is only one frame in the queue, then copy - * only active macroblocks. - * * \param[in] ctx Pointer to the lookahead context * \param[in] src Pointer to the image to enqueue * \param[in] ts_start Timestamp for the start of this frame * \param[in] ts_end Timestamp for the end of this frame * \param[in] flags Flags set on this frame - * \param[in] active_map Map that specifies which macroblock is active */ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, -#if CONFIG_VP9_HIGHBITDEPTH - int use_highbitdepth, -#endif + int64_t ts_start, int64_t ts_end, int use_highbitdepth, vpx_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode @@ -115,4 +124,4 @@ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ +#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c index e000220b97..2f20a8fe6d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c @@ -45,20 +45,24 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, mv_sf->search_method = HEX; vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param, - x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv, - dst_mv, 0, 0); + cpi->sf.mv.search_method, x->errorperbit, + cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0); mv_sf->search_method = old_search_method; + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { uint32_t distortion; uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + &v_fn_ptr, 0, mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + 0, USE_2_TAPS); } xd->mi[0]->mode = NEWMV; @@ -66,9 +70,6 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); } @@ -97,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. if (ref_mv->row != 0 || ref_mv->col != 0) { - unsigned int tmp_err; - MV zero_ref_mv = { 0, 0 }, tmp_mv; + MV zero_ref_mv = { 0, 0 }; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); @@ -218,7 +218,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset = 0; - int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; + int mb_y_offset = 0; MV gld_top_mv = { 0, 0 }; MODE_INFO mi_local; MODE_INFO mi_above, mi_left; @@ -237,13 +237,11 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->mi[0] = &mi_local; mi_local.sb_type = BLOCK_16X16; mi_local.ref_frame[0] = LAST_FRAME; - mi_local.ref_frame[1] = NONE; + mi_local.ref_frame[1] = NO_REF_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; int mb_y_in_offset = mb_y_offset; - int arf_y_in_offset = arf_y_offset; - int gld_y_in_offset = gld_y_offset; // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. @@ -265,8 +263,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->left_mi = &mi_left; mb_y_in_offset += 16; - gld_y_in_offset += 16; - arf_y_in_offset += 16; x->mv_limits.col_min -= 16; x->mv_limits.col_max -= 16; } @@ -275,8 +271,6 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->above_mi = &mi_above; mb_y_offset += buf->y_stride * 16; - gld_y_offset += golden_ref->y_stride * 16; - if (alt_ref) arf_y_offset += alt_ref->y_stride * 16; x->mv_limits.row_min -= 16; x->mv_limits.row_max -= 16; offset += cm->mb_cols; @@ -294,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; CHECK_MEM_ERROR( - cm, arf_not_zz, + &cm->error, arf_not_zz, vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); // We are not interested in results beyond the alt ref itself. @@ -339,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - // Only bother with segmentation if over 10% of the MBs in static segment - // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) - if (1) { - // Note % of blocks that are marked as static - if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); - // This error case should not be reachable as this function should - // never be called with the common data structure uninitialized. - else - cpi->static_mb_pct = 0; - - vp9_enable_segmentation(&cm->seg); - } else { + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else cpi->static_mb_pct = 0; - vp9_disable_segmentation(&cm->seg); - } + + vp9_enable_segmentation(&cm->seg); // Free localy allocated storage vpx_free(arf_not_zz); diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h index df2fb98efa..7b629861d5 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MBGRAPH_H_ -#define VP9_ENCODER_VP9_MBGRAPH_H_ +#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_ +#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; @@ -35,4 +37,4 @@ void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MBGRAPH_H_ +#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c index 70deda8421..1f7f174105 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c @@ -21,6 +21,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_encoder.h" @@ -28,11 +29,6 @@ // #define NEW_DIAMOND_SEARCH -static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, - const MV *mv) { - return &buf->buf[mv->row * buf->stride + mv->col]; -} - void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); @@ -52,6 +48,24 @@ void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { if (mv_limits->row_max > row_max) mv_limits->row_max = row_max; } +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv) { + subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8, + ref_mv->col - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8, + ref_mv->col + MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8, + ref_mv->row - MAX_FULL_PEL_VAL * 8); + subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8, + ref_mv->row + MAX_FULL_PEL_VAL * 8); + + subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min); + subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max); + subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min); + subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max); +} + int vp9_init_search_range(int size) { int sr = 0; // Minimum search size no matter what the passed in value. @@ -63,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -82,24 +88,13 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int error_per_bit) { if (mvcost) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; - // This product sits at a 32-bit ceiling right now and any additional - // accuracy in either bit cost or error cost will cause it to overflow. - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + return (int)ROUND64_POWER_OF_TWO( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT + PIXEL_TRANSFORM_ERROR_SCALE); } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -138,17 +133,6 @@ void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { cfg->total_steps = ss_count / cfg->searches_per_step; } -/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes - * from the same math as in mv_err_cost(). */ -#define MVC(r, c) \ - (mvcost \ - ? ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] + \ - mvcost[0][((r)-rr)] + mvcost[1][((c)-rc)]) * \ - error_per_bit + \ - 8192) >> \ - 14 \ - : 0) - // convert motion vector component to offset for sv[a]f calc static INLINE int sp(int x) { return x & 7; } @@ -158,54 +142,65 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #if CONFIG_VP9_HIGHBITDEPTH /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - int64_t tmpmse; \ - if (second_pred == NULL) { \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - } else { \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - } \ - tmpmse = thismse; \ - tmpmse += MVC(r, c); \ - if (tmpmse >= INT_MAX) { \ - v = INT_MAX; \ - } else if ((v = (uint32_t)tmpmse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + if (second_pred == NULL) { \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + } else { \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + } \ + tmpmse = thismse; \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #else /* checks if (r, c) has better score than previous best */ -#define CHECK_BETTER(v, r, c) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - if (second_pred == NULL) \ - thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse); \ - else \ - thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ - src_stride, &sse, second_pred); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - } else { \ - v = INT_MAX; \ - } +#define CHECK_BETTER(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse, second_pred); \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) #endif #define FIRST_LEVEL_CHECKS \ - { \ + do { \ unsigned int left, right, up, down, diag; \ CHECK_BETTER(left, tr, tc - hstep); \ CHECK_BETTER(right, tr, tc + hstep); \ @@ -218,10 +213,10 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \ } \ - } + } while (0) #define SECOND_LEVEL_CHECKS \ - { \ + do { \ int kr, kc; \ unsigned int second; \ if (tr != br && tc != bc) { \ @@ -250,58 +245,41 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \ } \ } \ - } + } while (0) -// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of -// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten -// later in the same way. -#define SECOND_LEVEL_CHECKS_BEST \ - { \ - unsigned int second; \ - int br0 = br; \ - int bc0 = bc; \ - assert(tr == br || tc == bc); \ - if (tr == br && tc != bc) { \ - kc = bc - tc; \ - } else if (tr != br && tc == bc) { \ - kr = br - tr; \ - } \ - CHECK_BETTER(second, br0 + kr, bc0); \ - CHECK_BETTER(second, br0, bc0 + kc); \ - if (br0 != br || bc0 != bc) { \ - CHECK_BETTER(second, br0 + kr, bc0 + kc); \ - } \ - } - -#define SETUP_SUBPEL_SEARCH \ - const uint8_t *const z = x->plane[0].src.buf; \ - const int src_stride = x->plane[0].src.stride; \ - const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = UINT_MAX; \ - unsigned int sse; \ - unsigned int whichdir; \ - int thismse; \ - const unsigned int halfiters = iters_per_step; \ - const unsigned int quarteriters = iters_per_step; \ - const unsigned int eighthiters = iters_per_step; \ - const int y_stride = xd->plane[0].pre[0].stride; \ - const int offset = bestmv->row * y_stride + bestmv->col; \ - const uint8_t *const y = xd->plane[0].pre[0].buf; \ - \ - int rr = ref_mv->row; \ - int rc = ref_mv->col; \ - int br = bestmv->row * 8; \ - int bc = bestmv->col * 8; \ - int hstep = 4; \ - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); \ - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); \ - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); \ - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); \ - int tr = br; \ - int tc = bc; \ - \ - bestmv->row *= 8; \ - bestmv->col *= 8; +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = UINT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr = ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + int minc, maxc, minr, maxr; \ + int tr = br; \ + int tc = bc; \ + MvLimits subpel_mv_limits; \ + \ + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \ + minc = subpel_mv_limits.col_min; \ + maxc = subpel_mv_limits.col_max; \ + minr = subpel_mv_limits.row_min; \ + maxr = subpel_mv_limits.row_max; \ + \ + bestmv->row *= 8; \ + bestmv->col *= 8 static unsigned int setup_center_error( const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv, @@ -314,12 +292,12 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, CONVERT_TO_SHORTPTR(y + offset), y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -334,7 +312,7 @@ static unsigned int setup_center_error( uint32_t besterr; (void)xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -346,7 +324,7 @@ static unsigned int setup_center_error( #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE int divide_and_round(const int n, const int d) { +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } @@ -364,20 +342,21 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) { // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { - *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), - (cost_list[1] - 2 * cost_list[0] + cost_list[3])); - *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), - (cost_list[4] - 2 * cost_list[0] + cost_list[2])); + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); } -uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, - const MV *ref_mv, int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - uint32_t *distortion, uint32_t *sse1, - const uint8_t *second_pred, int w, int h) { +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -400,10 +379,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)sse; (void)thismse; (void)cost_list; - - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; + (void)use_accurate_subpel_search; return besterr; } @@ -413,7 +389,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -425,12 +401,13 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; + (void)use_accurate_subpel_search; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; - unsigned int minpt; + unsigned int minpt = INT_MAX; get_cost_surf_min(cost_list, &ir, &ic, 2); if (ir != 0 || ic != 0) { CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); @@ -470,10 +447,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -482,8 +455,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -534,10 +509,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -546,8 +517,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -620,10 +593,6 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } @@ -636,12 +605,125 @@ static const MV search_step_table[12] = { }; /* clang-format on */ +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. +#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + do { \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ + src_stride, y, y_stride, second_pred, \ + w, h, &sse); \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } \ + } while (0) + +#endif + uint32_t vp9_find_best_sub_pixel_tree( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; @@ -659,16 +741,32 @@ uint32_t vp9_find_best_sub_pixel_tree( int bc = bestmv->col * 8; int hstep = 4; int iter, round = 3 - forced_stop; - const int minc = VPXMAX(x->mv_limits.col_min * 8, ref_mv->col - MV_MAX); - const int maxc = VPXMIN(x->mv_limits.col_max * 8, ref_mv->col + MV_MAX); - const int minr = VPXMAX(x->mv_limits.row_min * 8, ref_mv->row - MV_MAX); - const int maxr = VPXMIN(x->mv_limits.row_max * 8, ref_mv->row + MV_MAX); + + int minc, maxc, minr, maxr; int tr = br; int tc = bc; const MV *search_step = search_step_table; int idx, best_idx = -1; unsigned int cost_array[5]; int kr, kc; + MvLimits subpel_mv_limits; + + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. + const InterpKernel *kernel = + (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : ((use_accurate_subpel_search == USE_8_TAPS) + ? vp9_filter_kernels[EIGHTTAP] + : vp9_filter_kernels[EIGHTTAP_SHARP])) + : vp9_filter_kernels[BILINEAR]; + + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + minc = subpel_mv_limits.col_min; + maxc = subpel_mv_limits.col_max; + minr = subpel_mv_limits.row_min; + maxr = subpel_mv_limits.row_max; if (!(allow_hp && use_mv_hp(ref_mv))) if (round == 3) round = 2; @@ -688,16 +786,25 @@ uint32_t vp9_find_best_sub_pixel_tree( tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv; this_mv.row = tr; this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -719,14 +826,21 @@ uint32_t vp9_find_best_sub_pixel_tree( tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = { tr, tc }; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse, second_pred); + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -748,10 +862,48 @@ uint32_t vp9_find_best_sub_pixel_tree( bc = tc; } - if (iters_per_step > 1 && best_idx != -1) SECOND_LEVEL_CHECKS_BEST; + if (iters_per_step > 0 && best_idx != -1) { + unsigned int second; + const int br0 = br; + const int bc0 = bc; + assert(tr == br || tc == bc); - tr = br; - tc = bc; + if (tr == br && tc != bc) { + kc = bc - tc; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } + } + } else if (tr != br && tc == bc) { + kr = br - tr; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } + } + } + + if (iters_per_step > 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER(second, br0 + kr, bc0 + kc); + } + } + } + } search_step += 4; hstep >>= 1; @@ -769,15 +921,11 @@ uint32_t vp9_find_best_sub_pixel_tree( bestmv->row = br; bestmv->col = bc; - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return UINT_MAX; - return besterr; } -#undef MVC #undef CHECK_BETTER +#undef CHECK_BETTER1 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -805,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { } #define MAX_PATTERN_SCALES 11 -#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Calculate and return a sad+mvcost list around an integer best pel. @@ -819,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; int br = best_mv->row; int bc = best_mv->col; - MV this_mv; + const MV mv = { br, bc }; int i; unsigned int sse; - this_mv.row = br; - this_mv.col = bc; cost_list[0] = - fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, &sse) + - mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); if (check_bounds(&x->mv_limits, br, bc, 1)) { for (i = 0; i < 4; i++) { const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; @@ -858,13 +1004,12 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, // candidates as indicated in the num_candidates and candidates arrays // passed into this function // -static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, - int sad_per_bit, int do_init_search, - int *cost_list, const vp9_variance_fn_ptr_t *vfp, - int use_mvcost, const MV *center_mv, MV *best_mv, - const int num_candidates[MAX_PATTERN_SCALES], - const MV candidates[MAX_PATTERN_SCALES] - [MAX_PATTERN_CANDIDATES]) { +static int vp9_pattern_search( + const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, + int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, const MV *center_mv, MV *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) { const MACROBLOCKD *const xd = &x->e_mbd; static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, @@ -889,7 +1034,7 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1010,6 +1155,9 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, } while (s--); } + best_mv->row = br; + best_mv->col = bc; + // Returns the one-away integer pel sad values around the best as follows: // cost_list[0]: cost at the best integer pel // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel @@ -1017,11 +1165,8 @@ static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel if (cost_list) { - const MV best_mv = { br, bc }; - calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list); + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); } - best_mv->row = br; - best_mv->col = bc; return bestsad; } @@ -1063,7 +1208,7 @@ static int vp9_pattern_search_sad( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1489,7 +1634,7 @@ static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, // Exhuastive motion search around a given centre position with a given // step size. -static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, +static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int range, int step, int sad_per_bit, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { @@ -1575,10 +1720,342 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, return best_sad; } +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +#if CONFIG_NON_GREEDY_MV +static int64_t exhaustive_mesh_search_multi_step( + MV *best_mv, const MV *center_mv, int range, int step, + const struct buf_2d *src, const struct buf_2d *pre, int lambda, + const int_mv *nb_full_mvs, int full_mv_num, const MvLimits *mv_limits, + const vp9_variance_fn_ptr_t *fn_ptr) { + int64_t best_sad; + int r, c; + int start_col, end_col, start_row, end_row; + *best_mv = *center_mv; + best_sad = + ((int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, center_mv), pre->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(center_mv->row - range, mv_limits->row_min); + start_col = VPXMAX(center_mv->col - range, mv_limits->col_min); + end_row = VPXMIN(center_mv->row + range, mv_limits->row_max); + end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += step) { + const MV mv = { r, c }; + int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, &mv), pre->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + return best_sad; +} + +static int64_t exhaustive_mesh_search_single_step( + MV *best_mv, const MV *center_mv, int range, const struct buf_2d *src, + const struct buf_2d *pre, int lambda, const int_mv *nb_full_mvs, + int full_mv_num, const MvLimits *mv_limits, + const vp9_variance_fn_ptr_t *fn_ptr) { + int64_t best_sad; + int r, c, i; + int start_col, end_col, start_row, end_row; + + *best_mv = *center_mv; + best_sad = + ((int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, center_mv), pre->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(center_mv->row - range, mv_limits->row_min); + start_col = VPXMAX(center_mv->col - range, mv_limits->col_min); + end_row = VPXMIN(center_mv->row + range, mv_limits->row_max); + end_col = VPXMIN(center_mv->col + range, mv_limits->col_max); + for (r = start_row; r <= end_row; r += 1) { + c = start_col; + while (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { r, c + i }; + addrs[i] = get_buf_from_mv(pre, &mv); + } + fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads); + + for (i = 0; i < 4; ++i) { + int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; + if (sad < best_sad) { + const MV mv = { r, c + i }; + sad += + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + c += 4; + } + while (c <= end_col) { + const MV mv = { r, c }; + int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride, + get_buf_from_mv(pre, &mv), pre->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + c += 1; + } + } + return best_sad; +} + +static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv, + int range, int step, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *src = &x->plane[0].src; + const struct buf_2d *pre = &xd->plane[0].pre[0]; + assert(step >= 1); + assert(is_mv_in(&x->mv_limits, center_mv)); + if (step == 1) { + return exhaustive_mesh_search_single_step( + best_mv, center_mv, range, src, pre, lambda, nb_full_mvs, full_mv_num, + &x->mv_limits, fn_ptr); + } + return exhaustive_mesh_search_multi_step(best_mv, center_mv, range, step, src, + pre, lambda, nb_full_mvs, + full_mv_num, &x->mv_limits, fn_ptr); +} + +static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, + const vp9_variance_fn_ptr_t *fn_ptr, + MV *dst_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + int64_t bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) { + printf("ERROR: invalid range\n"); + assert(0); + } + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = clamp(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4, + MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = + exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv, + lambda, nb_full_mvs, full_mv_num); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. + for (i = 1; i < MAX_MESH_STEP; ++i) { + // First pass with coarser step and longer range + bestsme = exhaustive_mesh_search_new( + x, &temp_mv, sf->mesh_patterns[i].range, + sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs, + full_mv_num); + + if (sf->mesh_patterns[i].interval == 1) break; + } + } + + *dst_mv = temp_mv; + + return bestsme; +} + +static int64_t diamond_search_sad_new(const MACROBLOCK *x, + const search_site_config *cfg, + const MV *init_full_mv, MV *best_full_mv, + int search_param, int lambda, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, + int full_mv_num) { + int i, j, step; + + const MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; + + int64_t bestsad; + int best_site = -1; + int last_site = -1; + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step]; + const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step]; + const int tot_steps = cfg->total_steps - search_param; + vpx_clear_system_state(); + + *best_full_mv = *init_full_mv; + clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *num00 = 0; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride + + best_full_mv->col; + best_address = in_what; + + // Check the starting position + { + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + bestsad = mv_dist + lambda * mv_cost; + } + + i = 0; + + for (step = 0; step < tot_steps; step++) { + int all_in = 1, t; + + // All_in is true if every one of the points we are checking are within + // the bounds of the image. + all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min); + all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max); + all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min); + all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max); + + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity.. + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + const int64_t mv_dist = (int64_t)sad_array[t] << LOG2_PRECISION; + if (mv_dist < bestsad) { + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what, what_stride, check_here, + in_what_stride) + << LOG2_PRECISION; + if (mv_dist < bestsad) { + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + best_full_mv->row += ss_mv[best_site].row; + best_full_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, + int mi_col, int_mv *nb_full_mvs) { + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int nb_full_mv_num = 0; + int i; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0]; + int c = dirs[i][1]; + int brow = mi_row / mi_height + r; + int bcol = mi_col / mi_width + c; + if (brow >= 0 && brow < motion_field->block_rows && bcol >= 0 && + bcol < motion_field->block_cols) { + if (vp9_motion_field_is_mv_set(motion_field, brow, bcol)) { + int_mv mv = vp9_motion_field_get_mv(motion_field, brow, bcol); + nb_full_mvs[nb_full_mv_num].as_mv = get_full_mv(&mv.as_mv); + ++nb_full_mv_num; + } + } + } + return nb_full_mv_num; +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -1589,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -1607,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -1619,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -1646,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -1671,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -1784,12 +2255,15 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1811,6 +2285,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1822,18 +2297,19 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, } #if CONFIG_VP9_HIGHBITDEPTH - { - unsigned int this_sad; + // TODO(jingning): Implement integral projection functions for high bit-depth + // setting and remove this part of code. + if (xd->bd != 8) { + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); tmp_mv->row = 0; tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - return this_sad; + return sad; } #endif @@ -1873,7 +2349,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); @@ -1908,6 +2387,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -1916,19 +2399,139 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, return best_sad; } +static int get_exhaustive_threshold(int exhaustive_searches_thresh, + BLOCK_SIZE bsize) { + return exhaustive_searches_thresh >> + (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); +} + +#if CONFIG_NON_GREEDY_MV // Runs sequence of diamond searches in smaller steps for RD. /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ -static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, +int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int lambda, int do_refine, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv) { + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const SPEED_FEATURES *const sf = &cpi->sf; + int n, num00 = 0; + int thissme; + int bestsme; + const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + const MV center_mv = { 0, 0 }; + vpx_clear_system_state(); + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, lambda, + &n, fn_ptr, nb_full_mvs, full_mv_num); + + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + if (num00) { + num00--; + } else { + MV temp_mv; + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param + n, lambda, &num00, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV temp_mv = *best_mv; + vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize); + if (bestsme > exhaustive_thr) { + full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda, + nb_full_mvs, full_mv_num); + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + } + } + return bestsme; +} +#endif // CONFIG_NON_GREEDY_MV + +// Runs sequence of diamond searches in smaller steps for RD. +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. + const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -1943,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -1963,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -1973,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -1980,13 +2604,11 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, return bestsme; } -#define MIN_RANGE 7 -#define MAX_RANGE 256 -#define MIN_INTERVAL 1 // Runs an limited range exhaustive mesh search using a pattern set // according to the encode speed profile. -static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, - MV *centre_mv_full, int sadpb, int *cost_list, +static int full_pixel_exhaustive(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *centre_mv_full, + int sadpb, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { const SPEED_FEATURES *const sf = &cpi->sf; @@ -1998,9 +2620,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; - // Keep track of number of exhaustive calls (this frame in this thread). - ++(*x->ex_search_count_ptr); - // Trap illegal values for interval and range for this function. if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2010,12 +2629,12 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, // Check size of proposed first range against magnitude of the centre // value used as a starting point. - range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); - range = VPXMIN(range, MAX_RANGE); + range = clamp(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4, + MAX_RANGE); interval = VPXMAX(interval, range / baseline_interval_divisor); // initial search - bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, + bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, sadpb, fn_ptr, &temp_mv); if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { @@ -2023,7 +2642,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, // till we reach a step size of 1. Then break out. for (i = 1; i < MAX_MESH_STEP; ++i) { // First pass with coarser step and longer range - bestsme = exhuastive_mesh_search( + bestsme = exhaustive_mesh_search( x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range, sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv); @@ -2042,200 +2661,94 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; +#if CONFIG_NON_GREEDY_MV +int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, + int full_mv_num) { const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; const struct buf_2d *const what = &x->plane[0].src; const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } + const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv); + int64_t best_sad; + int i, j; + vpx_clear_system_state(); + { + const int64_t mv_dist = (int64_t)fn_ptr->sdf(what->buf, what->stride, + best_address, in_what->stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + best_sad = mv_dist + lambda * mv_cost; } - return best_sad; -} -int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) & + ((best_full_mv->row + 1) < x->mv_limits.row_max) & + ((best_full_mv->col - 1) > x->mv_limits.col_min) & + ((best_full_mv->col + 1) < x->mv_limits.col_max); - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + const int64_t mv_dist = (int64_t)sads[j] << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } + if (is_mv_in(&x->mv_limits, &mv)) { + const int64_t mv_dist = + (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), + in_what->stride) + << LOG2_PRECISION; + const int64_t mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const int64_t thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; } - ++check_here; - ++c; } } } - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; + if (best_site == -1) { + break; + } else { + best_full_mv->row += neighbors[best_site].row; + best_full_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, best_full_mv); } } return best_sad; } +#endif // CONFIG_NON_GREEDY_MV int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2244,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2261,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2281,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2358,26 +2872,16 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -#define MIN_EX_SEARCH_LIMIT 128 -static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { +int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int search_method, int error_per_bit, int *cost_list, + const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; - const int max_ex = - VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - - return sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; -} - -int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd) { - const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; - vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const SEARCH_METHODS method = (SEARCH_METHODS)search_method; + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; + int run_exhaustive_search = 0; + if (cost_list) { cost_list[0] = INT_MAX; cost_list[1] = INT_MAX; @@ -2386,9 +2890,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cost_list[4] = INT_MAX; } - // Keep track of number of searches (this frame in this thread). - ++(*x->m_search_count_ptr); - switch (method) { case FAST_DIAMOND: var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, @@ -2411,35 +2912,124 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, fn_ptr, 1, ref_mv, tmp_mv); break; case NSTEP: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); - - // Should we allow a follow on exhaustive search? - if (is_exhaustive_allowed(cpi, x)) { - int64_t exhuastive_thr = sf->exhaustive_searches_thresh; - exhuastive_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - // Threshold variance for an exhaustive full search. - if (var > exhuastive_thr) { - int var_ex; - MV tmp_mv_ex; - var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, - cost_list, fn_ptr, ref_mv, &tmp_mv_ex); - - if (var_ex < var) { - var = var_ex; - *tmp_mv = tmp_mv_ex; - } - } - } + case MESH: + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; - default: assert(0 && "Invalid search method."); + default: assert(0 && "Unknown search method"); } - if (method != NSTEP && rd && var < var_max) + if (method == NSTEP) { + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize); + if (var > exhaustive_thr) { + run_exhaustive_search = 1; + } + } + } else if (method == MESH) { + run_exhaustive_search = 1; + } + + if (run_exhaustive_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list, + fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } + } + + if (method != NSTEP && method != MESH && rd && var < var_max) var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); return var; } + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +#define COMMON_MV_TEST \ + SETUP_SUBPEL_SEARCH; \ + \ + (void)error_per_bit; \ + (void)vfp; \ + (void)z; \ + (void)src_stride; \ + (void)y; \ + (void)y_stride; \ + (void)second_pred; \ + (void)w; \ + (void)h; \ + (void)offset; \ + (void)mvjcost; \ + (void)mvcost; \ + (void)sse1; \ + (void)distortion; \ + \ + (void)halfiters; \ + (void)quarteriters; \ + (void)eighthiters; \ + (void)whichdir; \ + (void)allow_hp; \ + (void)forced_stop; \ + (void)hstep; \ + (void)rr; \ + (void)rc; \ + \ + (void)tr; \ + (void)tc; \ + (void)sse; \ + (void)thismse; \ + (void)cost_list; \ + (void)use_accurate_subpel_search + +// Return the maximum MV. +uint32_t vp9_return_max_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + COMMON_MV_TEST; + + (void)minr; + (void)minc; + + bestmv->row = maxr; + bestmv->col = maxc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} +// Return the minimum MV. +uint32_t vp9_return_min_sub_pixel_mv( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { + COMMON_MV_TEST; + + (void)maxr; + (void)maxc; + + bestmv->row = minr; + bestmv->col = minc; + besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv)); + + return besterr; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h index 2726b9e230..fd6a8b9aca 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h @@ -8,10 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MCOMP_H_ -#define VP9_ENCODER_VP9_MCOMP_H_ +#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_ +#define VPX_VP9_ENCODER_VP9_MCOMP_H_ #include "vp9/encoder/vp9_block.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_non_greedy_mv.h" +#endif // CONFIG_NON_GREEDY_MV #include "vpx_dsp/variance.h" #ifdef __cplusplus @@ -38,6 +41,16 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); void vp9_init3smotion_compensation(search_site_config *cfg, int stride); @@ -55,18 +68,20 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, - const struct vp9_variance_vtable *fn_ptr, + int error_per_bit, int search_range, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -74,28 +89,20 @@ typedef uint32_t(fractional_mv_step_fp)( int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h); + int h, int use_accurate_subpel_search); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore; extern fractional_mv_step_fp vp9_skip_sub_pixel_tree; - -typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv); - -typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv); +extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; +extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, @@ -104,13 +111,68 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; -int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd); +// "mvp_full" is the MV search starting point; +// "ref_mv" is the context reference MV; +// "tmp_mv" is the searched best MV. +int vp9_full_pixel_search(const struct VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd); +void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, + const MvLimits *umv_window_limits, + const MV *ref_mv); + +#if CONFIG_NON_GREEDY_MV +struct TplDepStats; +int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num); + +int vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int lambda, int do_refine, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv); + +static INLINE MV get_full_mv(const MV *mv) { + MV out_mv; + out_mv.row = mv->row >> 3; + out_mv.col = mv->col >> 3; + return out_mv; +} +struct TplDepFrame; +int vp9_prepare_nb_full_mvs(const struct MotionField *motion_field, int mi_row, + int mi_col, int_mv *nb_full_mvs); + +static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) { + BLOCK_SIZE square_bsize; + switch (bsize) { + case BLOCK_4X4: + case BLOCK_4X8: + case BLOCK_8X4: square_bsize = BLOCK_4X4; break; + case BLOCK_8X8: + case BLOCK_8X16: + case BLOCK_16X8: square_bsize = BLOCK_8X8; break; + case BLOCK_16X16: + case BLOCK_16X32: + case BLOCK_32X16: square_bsize = BLOCK_16X16; break; + case BLOCK_32X32: + case BLOCK_32X64: + case BLOCK_64X32: + case BLOCK_64X64: square_bsize = BLOCK_32X32; break; + default: + square_bsize = BLOCK_INVALID; + assert(0 && "ERROR: invalid block size"); + break; + } + return square_bsize; +} +#endif // CONFIG_NON_GREEDY_MV #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MCOMP_H_ +#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c new file mode 100644 index 0000000000..8437ce7531 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vpx_util/vpx_pthread.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hdl = NULL; + void *next = NULL; + JobNode *job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_handle = NULL; +#endif + + row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]); + job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex_handle = &row_mt_info->job_mutex; +#endif + +// lock the mutex for queue access +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex_handle); +#endif + next = job_queue_hdl->next; + if (next != NULL) { + JobQueue *job_queue = (JobQueue *)next; + job_info = &job_queue->job_info; + // Update the next job in the queue + job_queue_hdl->next = job_queue->next; + job_queue_hdl->num_jobs_acquired++; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex_handle); +#endif + + return job_info; +} + +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile) { + VP9_COMMON *const cm = &cpi->common; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + if (this_tile->row_base_thresh_freq_fact != NULL) { + if (sb_rows <= this_tile->sb_rows) { + return; + } + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + CHECK_MEM_ERROR( + &cm->error, this_tile->row_base_thresh_freq_fact, + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact)))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + this_tile->sb_rows = sb_rows; +} + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { + struct VP9Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int jobs_per_tile_col, total_jobs; + + // Allocate memory that is large enough for all row_mt stages. First pass + // uses 16x16 block size. + jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); + // Calculate the total number of jobs + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; + + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); + +#if CONFIG_MULTITHREAD + // Create mutex for each tile + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_init(&row_mt_info->job_mutex, NULL); + } +#endif + + // Allocate memory for row based multi-threading + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + } + + // Assign the sync pointer of tile row zero for every tile row > 0 + for (tile_row = 1; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileDataEnc *this_col_tile = &cpi->tile_data[tile_col]; + this_tile->row_mt_sync = this_col_tile->row_mt_sync; + } + } + + // Calculate the number of vertical units in the given tile row + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols]; + TileInfo *tile_info = &this_tile->tile_info; + multi_thread_ctxt->num_tile_vert_sbs[tile_row] = + get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } +} + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif + + // Deallocate memory for job queue + if (multi_thread_ctxt->job_queue) { + vpx_free(multi_thread_ctxt->job_queue); + multi_thread_ctxt->job_queue = NULL; + } + +#if CONFIG_MULTITHREAD + // Destroy mutex for each tile + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_destroy(&row_mt_info->job_mutex); + } +#endif + + // Free row based multi-threading sync memory + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + } + } +#endif + + multi_thread_ctxt->allocated_tile_cols = 0; + multi_thread_ctxt->allocated_tile_rows = 0; + multi_thread_ctxt->allocated_vert_unit_rows = 0; +} + +void vp9_multi_thread_tile_init(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + for (i = 0; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col); + vp9_zero(this_tile->fp_data); + this_tile->fp_data.image_data_start_row = INVALID_ROW; + } +} + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers) { + int tile_id = 0; + int i; + + // Allocating the threads for the tiles + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == tile_cols) tile_id = 0; + } +} + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hndl; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; +#endif + int num_jobs_remaining; + + row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id]; + job_queue_hndl = &row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex = &row_mt_info->job_mutex; +#endif + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex); +#endif + num_jobs_remaining = + multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex); +#endif + + return (num_jobs_remaining); +} + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { + VP9_COMMON *const cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + JobQueue *job_queue = multi_thread_ctxt->job_queue; + const int tile_cols = 1 << cm->log2_tile_cols; + int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int tile_col, i; + + switch (job_type) { + case ENCODE_JOB: jobs_per_tile_col = sb_rows; break; + case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break; + case ARNR_JOB: + jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT); + break; + default: assert(0); + } + + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; + // memset the entire job queue buffer to zero + memset(job_queue, 0, total_jobs * sizeof(JobQueue)); + + // Job queue preparation + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col]; + JobQueue *job_queue_curr, *job_queue_temp; + int tile_row = 0; + + tile_ctxt->job_queue_hdl.next = (void *)job_queue; + tile_ctxt->job_queue_hdl.num_jobs_acquired = 0; + + job_queue_curr = job_queue; + job_queue_temp = job_queue; + + // loop over all the vertical rows + for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col; + job_row_num++, jobs_per_tile++) { + job_queue_curr->job_info.vert_unit_row_num = job_row_num; + job_queue_curr->job_info.tile_col_id = tile_col; + job_queue_curr->job_info.tile_row_id = tile_row; + job_queue_curr->next = (void *)(job_queue_temp + 1); + job_queue_curr = ++job_queue_temp; + + if (ENCODE_JOB == job_type) { + if (jobs_per_tile >= + multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) { + tile_row++; + jobs_per_tile = -1; + } + } + } + + // Set the last pointer to NULL + job_queue_curr += -1; + job_queue_curr->next = (void *)NULL; + + // Move to the next tile + job_queue += jobs_per_tile_col; + } + + for (i = 0; i < cpi->num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + thread_data->thread_id = i; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) + thread_data->tile_completion_status[tile_col] = 0; + } +} + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols) { + int tile_col; + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_num_jobs_remaining = 0; + int num_jobs_remaining; + + // Mark the completion to avoid check in the loop + tile_completion_status[*cur_tile_id] = 1; + // Check for the status of all the tiles + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + if (tile_completion_status[tile_col] == 0) { + num_jobs_remaining = + vp9_get_job_queue_status(multi_thread_ctxt, tile_col); + // Mark the completion to avoid checks during future switches across tiles + if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1; + if (num_jobs_remaining > max_num_jobs_remaining) { + max_num_jobs_remaining = num_jobs_remaining; + tile_id = tile_col; + } + } + } + + if (-1 == tile_id) { + return 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + return 0; + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h new file mode 100644 index 0000000000..a2276f4fe6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ +#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_job_queue.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id); + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type); + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id); + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers); + +void vp9_multi_thread_tile_init(VP9_COMP *cpi); + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi); + +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile); + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols); + +#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c index 2252fe16b9..4ee6e51ba8 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -21,27 +21,42 @@ #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_encoder.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; - ne->level = kLowLow; + ne->level = (width * height < 1280 * 720) ? kLowLow : kLow; ne->value = 0; ne->count = 0; - ne->thresh = 100; + ne->thresh = 90; ne->last_w = 0; ne->last_h = 0; if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { ne->thresh = 140; + } else if (width * height >= 640 * 360) { + ne->thresh = 115; } - ne->num_frames_estimate = 20; + ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; } static int enable_noise_estimation(VP9_COMP *const cpi) { -// Enable noise estimation if denoising is on, but not for low resolutions. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return 0; +#endif +// Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 640 && - cpi->common.height >= 360) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif // Only allow noise estimate under certain encoding mode. @@ -51,8 +66,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 360) + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; @@ -83,7 +98,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { } else { if (ne->value > ne->thresh) noise_level = kMedium; - else if (ne->value > ((9 * ne->thresh) >> 4)) + else if (ne->value > (ne->thresh >> 1)) noise_level = kLow; else noise_level = kLowLow; @@ -94,24 +109,32 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { void vp9_update_noise_estimate(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; NOISE_ESTIMATE *const ne = &cpi->noise_estimate; + const int low_res = (cm->width <= 352 && cm->height <= 288); // Estimate of noise level every frame_period frames. int frame_period = 8; int thresh_consec_zeromv = 6; - unsigned int thresh_sum_diff = 100; - unsigned int thresh_sum_spatial = (200 * 200) << 8; - unsigned int thresh_spatial_var = (32 * 32) << 8; - int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7; + int frame_counter = cm->current_video_frame; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) last_source = &cpi->denoiser.last_source; + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { + last_source = &cpi->denoiser.last_source; + // Tune these thresholds for different resolutions when denoising is + // enabled. + if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; + } + } #endif ne->enabled = enable_noise_estimation(cpi); - if (!ne->enabled || cm->current_video_frame % frame_period != 0 || - last_source == NULL || ne->last_w != cm->width || - ne->last_h != cm->height) { + if (cpi->svc.number_spatial_layers > 1) + frame_counter = cpi->svc.current_superframe; + if (!ne->enabled || frame_counter % frame_period != 0 || + last_source == NULL || + (cpi->svc.number_spatial_layers == 1 && + (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif if (last_source != NULL) { @@ -119,20 +142,30 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cpi->rc.avg_frame_low_motion < 50) { + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; + ne->count = 0; + ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && + cpi->svc.current_superframe > 1) { + vp9_denoiser_set_noise_level(cpi, ne->level); + copy_frame(&cpi->denoiser.last_source, cpi->Source); + } #endif return; } else { - int num_samples = 0; - uint64_t avg_est = 0; + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; int bsize = BLOCK_16X16; - static const unsigned char const_source[16] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have // been encoded as zero/small mv at least x consecutive frames, compute // the variance to update estimate of noise in the source. @@ -164,44 +197,36 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { int bl_index1 = bl_index + 1; int bl_index2 = bl_index + cm->mi_cols; int bl_index3 = bl_index2 + 1; - // Only consider blocks that are likely steady background. i.e, have - // been encoded as zero/low motion x (= thresh_consec_zeromv) frames - // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - int is_skin = 0; - if (cpi->use_skin_detection) { - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, bsize, consec_zeromv, 0); - } - if (frame_low_motion && - cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv && - !is_skin) { - // Compute variance. - unsigned int sse; - unsigned int variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += variance / ((spatial_variance >> 9) + 1); - num_samples++; - } + // Only consider blocks that are likely steady background. i.e., have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance between co-located blocks from current and + // last input frames. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail } } } @@ -217,29 +242,61 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { } ne->last_w = cm->width; ne->last_h = cm->height; - // Update noise estimate if we have at a minimum number of block samples, - // and avg_est > 0 (avg_est == 0 can happen if the application inputs - // duplicate frames). - if (num_samples > min_blocks_estimate && avg_est > 0) { - // Normalize. - avg_est = avg_est / num_samples; - // Update noise estimate. - ne->value = (int)((15 * ne->value + avg_est) >> 4); - ne->count++; - if (ne->count == ne->num_frames_estimate) { - // Reset counter and check noise level condition. - ne->num_frames_estimate = 30; - ne->count = 0; - ne->level = vp9_noise_estimate_extract_level(ne); -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); -#endif + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; } } + + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { + ne->count++; + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = vp9_noise_estimate_extract_level(ne); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + vp9_denoiser_set_noise_level(cpi, ne->level); +#endif + } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h index 335cdbe643..7fc94ff8c9 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_ -#define VP9_ENCODER_NOISE_ESTIMATE_H_ +#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ +#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -23,6 +23,8 @@ extern "C" { #endif +#define MAX_VAR_HIST_BINS 20 + typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; typedef struct noise_estimate { @@ -30,6 +32,7 @@ typedef struct noise_estimate { NOISE_LEVEL level; int value; int thresh; + int adapt_thresh; int count; int last_w; int last_h; @@ -48,4 +51,4 @@ void vp9_update_noise_estimate(struct VP9_COMP *const cpi); } // extern "C" #endif -#endif // VP9_ENCODER_NOISE_ESTIMATE_H_ +#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c new file mode 100644 index 0000000000..d52801c845 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c @@ -0,0 +1,536 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_mv.h" +#include "vp9/encoder/vp9_non_greedy_mv.h" +// TODO(angiebird): move non_greedy_mv related functions to this file + +#define LOG2_TABLE_SIZE 1024 +static const int log2_table[LOG2_TABLE_SIZE] = { + 0, // This is a dummy value + 0, 1048576, 1661954, 2097152, 2434718, 2710530, 2943725, + 3145728, 3323907, 3483294, 3627477, 3759106, 3880192, 3992301, + 4096672, 4194304, 4286015, 4372483, 4454275, 4531870, 4605679, + 4676053, 4743299, 4807682, 4869436, 4928768, 4985861, 5040877, + 5093962, 5145248, 5194851, 5242880, 5289431, 5334591, 5378443, + 5421059, 5462508, 5502851, 5542146, 5580446, 5617800, 5654255, + 5689851, 5724629, 5758625, 5791875, 5824409, 5856258, 5887450, + 5918012, 5947969, 5977344, 6006160, 6034437, 6062195, 6089453, + 6116228, 6142538, 6168398, 6193824, 6218829, 6243427, 6267632, + 6291456, 6314910, 6338007, 6360756, 6383167, 6405252, 6427019, + 6448477, 6469635, 6490501, 6511084, 6531390, 6551427, 6571202, + 6590722, 6609993, 6629022, 6647815, 6666376, 6684713, 6702831, + 6720734, 6738427, 6755916, 6773205, 6790299, 6807201, 6823917, + 6840451, 6856805, 6872985, 6888993, 6904834, 6920510, 6936026, + 6951384, 6966588, 6981641, 6996545, 7011304, 7025920, 7040397, + 7054736, 7068940, 7083013, 7096956, 7110771, 7124461, 7138029, + 7151476, 7164804, 7178017, 7191114, 7204100, 7216974, 7229740, + 7242400, 7254954, 7267405, 7279754, 7292003, 7304154, 7316208, + 7328167, 7340032, 7351805, 7363486, 7375079, 7386583, 7398000, + 7409332, 7420579, 7431743, 7442826, 7453828, 7464751, 7475595, + 7486362, 7497053, 7507669, 7518211, 7528680, 7539077, 7549404, + 7559660, 7569847, 7579966, 7590017, 7600003, 7609923, 7619778, + 7629569, 7639298, 7648964, 7658569, 7668114, 7677598, 7687023, + 7696391, 7705700, 7714952, 7724149, 7733289, 7742375, 7751407, + 7760385, 7769310, 7778182, 7787003, 7795773, 7804492, 7813161, + 7821781, 7830352, 7838875, 7847350, 7855777, 7864158, 7872493, + 7880782, 7889027, 7897226, 7905381, 7913492, 7921561, 7929586, + 7937569, 7945510, 7953410, 7961268, 7969086, 7976864, 7984602, + 7992301, 7999960, 8007581, 8015164, 8022709, 8030217, 8037687, + 8045121, 8052519, 8059880, 8067206, 8074496, 8081752, 8088973, + 8096159, 8103312, 8110431, 8117516, 8124569, 8131589, 8138576, + 8145532, 8152455, 8159347, 8166208, 8173037, 8179836, 8186605, + 8193343, 8200052, 8206731, 8213380, 8220001, 8226593, 8233156, + 8239690, 8246197, 8252676, 8259127, 8265550, 8271947, 8278316, + 8284659, 8290976, 8297266, 8303530, 8309768, 8315981, 8322168, + 8328330, 8334467, 8340579, 8346667, 8352730, 8358769, 8364784, + 8370775, 8376743, 8382687, 8388608, 8394506, 8400381, 8406233, + 8412062, 8417870, 8423655, 8429418, 8435159, 8440878, 8446576, + 8452252, 8457908, 8463542, 8469155, 8474748, 8480319, 8485871, + 8491402, 8496913, 8502404, 8507875, 8513327, 8518759, 8524171, + 8529564, 8534938, 8540293, 8545629, 8550947, 8556245, 8561525, + 8566787, 8572031, 8577256, 8582464, 8587653, 8592825, 8597980, + 8603116, 8608236, 8613338, 8618423, 8623491, 8628542, 8633576, + 8638593, 8643594, 8648579, 8653547, 8658499, 8663434, 8668354, + 8673258, 8678145, 8683017, 8687874, 8692715, 8697540, 8702350, + 8707145, 8711925, 8716690, 8721439, 8726174, 8730894, 8735599, + 8740290, 8744967, 8749628, 8754276, 8758909, 8763528, 8768134, + 8772725, 8777302, 8781865, 8786415, 8790951, 8795474, 8799983, + 8804478, 8808961, 8813430, 8817886, 8822328, 8826758, 8831175, + 8835579, 8839970, 8844349, 8848715, 8853068, 8857409, 8861737, + 8866053, 8870357, 8874649, 8878928, 8883195, 8887451, 8891694, + 8895926, 8900145, 8904353, 8908550, 8912734, 8916908, 8921069, + 8925220, 8929358, 8933486, 8937603, 8941708, 8945802, 8949885, + 8953957, 8958018, 8962068, 8966108, 8970137, 8974155, 8978162, + 8982159, 8986145, 8990121, 8994086, 8998041, 9001986, 9005920, + 9009844, 9013758, 9017662, 9021556, 9025440, 9029314, 9033178, + 9037032, 9040877, 9044711, 9048536, 9052352, 9056157, 9059953, + 9063740, 9067517, 9071285, 9075044, 9078793, 9082533, 9086263, + 9089985, 9093697, 9097400, 9101095, 9104780, 9108456, 9112123, + 9115782, 9119431, 9123072, 9126704, 9130328, 9133943, 9137549, + 9141146, 9144735, 9148316, 9151888, 9155452, 9159007, 9162554, + 9166092, 9169623, 9173145, 9176659, 9180165, 9183663, 9187152, + 9190634, 9194108, 9197573, 9201031, 9204481, 9207923, 9211357, + 9214784, 9218202, 9221613, 9225017, 9228412, 9231800, 9235181, + 9238554, 9241919, 9245277, 9248628, 9251971, 9255307, 9258635, + 9261956, 9265270, 9268577, 9271876, 9275169, 9278454, 9281732, + 9285002, 9288266, 9291523, 9294773, 9298016, 9301252, 9304481, + 9307703, 9310918, 9314126, 9317328, 9320523, 9323711, 9326892, + 9330067, 9333235, 9336397, 9339552, 9342700, 9345842, 9348977, + 9352106, 9355228, 9358344, 9361454, 9364557, 9367654, 9370744, + 9373828, 9376906, 9379978, 9383043, 9386102, 9389155, 9392202, + 9395243, 9398278, 9401306, 9404329, 9407345, 9410356, 9413360, + 9416359, 9419351, 9422338, 9425319, 9428294, 9431263, 9434226, + 9437184, 9440136, 9443082, 9446022, 9448957, 9451886, 9454809, + 9457726, 9460638, 9463545, 9466446, 9469341, 9472231, 9475115, + 9477994, 9480867, 9483735, 9486597, 9489454, 9492306, 9495152, + 9497993, 9500828, 9503659, 9506484, 9509303, 9512118, 9514927, + 9517731, 9520530, 9523324, 9526112, 9528895, 9531674, 9534447, + 9537215, 9539978, 9542736, 9545489, 9548237, 9550980, 9553718, + 9556451, 9559179, 9561903, 9564621, 9567335, 9570043, 9572747, + 9575446, 9578140, 9580830, 9583514, 9586194, 9588869, 9591540, + 9594205, 9596866, 9599523, 9602174, 9604821, 9607464, 9610101, + 9612735, 9615363, 9617987, 9620607, 9623222, 9625832, 9628438, + 9631040, 9633637, 9636229, 9638818, 9641401, 9643981, 9646556, + 9649126, 9651692, 9654254, 9656812, 9659365, 9661914, 9664459, + 9666999, 9669535, 9672067, 9674594, 9677118, 9679637, 9682152, + 9684663, 9687169, 9689672, 9692170, 9694665, 9697155, 9699641, + 9702123, 9704601, 9707075, 9709545, 9712010, 9714472, 9716930, + 9719384, 9721834, 9724279, 9726721, 9729159, 9731593, 9734024, + 9736450, 9738872, 9741291, 9743705, 9746116, 9748523, 9750926, + 9753326, 9755721, 9758113, 9760501, 9762885, 9765266, 9767642, + 9770015, 9772385, 9774750, 9777112, 9779470, 9781825, 9784175, + 9786523, 9788866, 9791206, 9793543, 9795875, 9798204, 9800530, + 9802852, 9805170, 9807485, 9809797, 9812104, 9814409, 9816710, + 9819007, 9821301, 9823591, 9825878, 9828161, 9830441, 9832718, + 9834991, 9837261, 9839527, 9841790, 9844050, 9846306, 9848559, + 9850808, 9853054, 9855297, 9857537, 9859773, 9862006, 9864235, + 9866462, 9868685, 9870904, 9873121, 9875334, 9877544, 9879751, + 9881955, 9884155, 9886352, 9888546, 9890737, 9892925, 9895109, + 9897291, 9899469, 9901644, 9903816, 9905985, 9908150, 9910313, + 9912473, 9914629, 9916783, 9918933, 9921080, 9923225, 9925366, + 9927504, 9929639, 9931771, 9933900, 9936027, 9938150, 9940270, + 9942387, 9944502, 9946613, 9948721, 9950827, 9952929, 9955029, + 9957126, 9959219, 9961310, 9963398, 9965484, 9967566, 9969645, + 9971722, 9973796, 9975866, 9977934, 9980000, 9982062, 9984122, + 9986179, 9988233, 9990284, 9992332, 9994378, 9996421, 9998461, + 10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665, + 10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738, + 10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681, + 10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496, + 10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186, + 10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754, + 10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201, + 10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529, + 10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742, + 10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839, + 10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825, + 10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699, + 10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465, + 10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125, + 10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679, + 10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130, + 10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479, + 10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728, + 10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879, + 10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933, + 10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892, + 10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757, + 10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530, + 10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211, + 10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804, + 10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308, + 10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725, + 10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057, + 10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304, + 10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468, + 10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551, + 10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553, + 10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476, + 10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320, + 10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087, + 10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778, + 10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394, + 10452905, 10454414, 10455921, 10457427, 10458932, 10460435, 10461936, + 10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405, + 10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802, + 10484282, +}; + +static int mi_size_to_block_size(int mi_bsize, int mi_num) { + return (mi_num % mi_bsize) ? mi_num / mi_bsize + 1 : mi_num / mi_bsize; +} + +Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, + int frame_num, int mi_rows, int mi_cols) { + int frame_idx, rf_idx, square_block_idx; + if (motion_field_info->allocated) { + // TODO(angiebird): Avoid re-allocate buffer if possible + vp9_free_motion_field_info(motion_field_info); + } + motion_field_info->frame_num = frame_num; + motion_field_info->motion_field_array = + vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array)); + if (!motion_field_info->motion_field_array) return STATUS_FAILED; + for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE bsize = square_block_idx_to_bsize(square_block_idx); + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int block_rows = mi_size_to_block_size(mi_height, mi_rows); + const int block_cols = mi_size_to_block_size(mi_width, mi_cols); + MotionField *motion_field = + &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; + Status status = + vp9_alloc_motion_field(motion_field, bsize, block_rows, block_cols); + if (status == STATUS_FAILED) { + return STATUS_FAILED; + } + } + } + } + motion_field_info->allocated = 1; + return STATUS_OK; +} + +Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize, + int block_rows, int block_cols) { + Status status = STATUS_OK; + motion_field->ready = 0; + motion_field->bsize = bsize; + motion_field->block_rows = block_rows; + motion_field->block_cols = block_cols; + motion_field->block_num = block_rows * block_cols; + motion_field->mf = + vpx_calloc(motion_field->block_num, sizeof(*motion_field->mf)); + if (motion_field->mf == NULL) { + status = STATUS_FAILED; + } + motion_field->set_mv = + vpx_calloc(motion_field->block_num, sizeof(*motion_field->set_mv)); + if (motion_field->set_mv == NULL) { + vpx_free(motion_field->mf); + motion_field->mf = NULL; + status = STATUS_FAILED; + } + motion_field->local_structure = vpx_calloc( + motion_field->block_num, sizeof(*motion_field->local_structure)); + if (motion_field->local_structure == NULL) { + vpx_free(motion_field->mf); + motion_field->mf = NULL; + vpx_free(motion_field->set_mv); + motion_field->set_mv = NULL; + status = STATUS_FAILED; + } + return status; +} + +void vp9_free_motion_field(MotionField *motion_field) { + vpx_free(motion_field->mf); + vpx_free(motion_field->set_mv); + vpx_free(motion_field->local_structure); + vp9_zero(*motion_field); +} + +void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info) { + if (motion_field_info->allocated) { + int frame_idx, rf_idx, square_block_idx; + for (frame_idx = 0; frame_idx < motion_field_info->frame_num; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + MotionField *motion_field = + &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; + vp9_free_motion_field(motion_field); + } + } + } + vpx_free(motion_field_info->motion_field_array); + motion_field_info->motion_field_array = NULL; + motion_field_info->frame_num = 0; + motion_field_info->allocated = 0; + } +} + +MotionField *vp9_motion_field_info_get_motion_field( + MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx, + BLOCK_SIZE bsize) { + int square_block_idx = get_square_block_idx(bsize); + assert(frame_idx < motion_field_info->frame_num); + assert(motion_field_info->allocated == 1); + return &motion_field_info + ->motion_field_array[frame_idx][rf_idx][square_block_idx]; +} + +int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow, + int bcol) { + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + return motion_field->set_mv[brow * motion_field->block_cols + bcol]; +} + +int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow, + int bcol) { + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + return motion_field->mf[brow * motion_field->block_cols + bcol]; +} + +int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row, + int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int brow = mi_row / mi_height; + const int bcol = mi_col / mi_width; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + return vp9_motion_field_get_mv(motion_field, brow, bcol); +} + +void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row, + int mi_col, int_mv mv) { + const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize]; + const int brow = mi_row / mi_height; + const int bcol = mi_col / mi_width; + assert(mi_row % mi_height == 0); + assert(mi_col % mi_width == 0); + assert(brow >= 0 && brow < motion_field->block_rows); + assert(bcol >= 0 && bcol < motion_field->block_cols); + motion_field->mf[brow * motion_field->block_cols + bcol] = mv; + motion_field->set_mv[brow * motion_field->block_cols + bcol] = 1; +} + +void vp9_motion_field_reset_mvs(MotionField *motion_field) { + memset(motion_field->set_mv, 0, + motion_field->block_num * sizeof(*motion_field->set_mv)); +} + +static int64_t log2_approximation(int64_t v) { + assert(v > 0); + if (v < LOG2_TABLE_SIZE) { + return log2_table[v]; + } else { + // use linear approximation when v >= 2^10 + const int slope = + 1477; // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION) + assert(LOG2_TABLE_SIZE == 1 << 10); + + return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION); + } +} + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs, + int mv_num) { + // The behavior of this function is to compute log2 of mv difference, + // i.e. min log2(1 + row_diff * row_diff + col_diff * col_diff) + // against available neighbor mvs. + // Since the log2 is monotonically increasing, we can compute + // min row_diff * row_diff + col_diff * col_diff first + // then apply log2 in the end. + int i; + int64_t min_abs_diff = INT64_MAX; + int cnt = 0; + assert(mv_num <= NB_MVS_NUM); + for (i = 0; i < mv_num; ++i) { + MV nb_mv = nb_full_mvs[i].as_mv; + const int64_t row_diff = abs(mv->row - nb_mv.row); + const int64_t col_diff = abs(mv->col - nb_mv.col); + const int64_t abs_diff = row_diff * row_diff + col_diff * col_diff; + assert(nb_full_mvs[i].as_int != INVALID_MV); + min_abs_diff = VPXMIN(abs_diff, min_abs_diff); + ++cnt; + } + if (cnt) { + return log2_approximation(1 + min_abs_diff); + } + return 0; +} + +static FloatMV get_smooth_motion_vector(const FloatMV scaled_search_mv, + const FloatMV *tmp_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, int row, int col, + float alpha) { + const FloatMV tmp_mv = tmp_mf[row * cols + col]; + int idx_row, idx_col; + FloatMV avg_nb_mv = { 0.0f, 0.0f }; + FloatMV mv = { 0.0f, 0.0f }; + float filter[3][3] = { { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f }, + { 1.0f / 6.0f, 0.0f, 1.0f / 6.0f }, + { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f } }; + for (idx_row = 0; idx_row < 3; ++idx_row) { + int nb_row = row + idx_row - 1; + for (idx_col = 0; idx_col < 3; ++idx_col) { + int nb_col = col + idx_col - 1; + if (nb_row < 0 || nb_col < 0 || nb_row >= rows || nb_col >= cols) { + avg_nb_mv.row += (tmp_mv.row) * filter[idx_row][idx_col]; + avg_nb_mv.col += (tmp_mv.col) * filter[idx_row][idx_col]; + } else { + const FloatMV nb_mv = tmp_mf[nb_row * cols + nb_col]; + avg_nb_mv.row += (nb_mv.row) * filter[idx_row][idx_col]; + avg_nb_mv.col += (nb_mv.col) * filter[idx_row][idx_col]; + } + } + } + { + // M is the local variance of reference frame + float M00 = M[row * cols + col][0]; + float M01 = M[row * cols + col][1]; + float M10 = M[row * cols + col][2]; + float M11 = M[row * cols + col][3]; + + float det = (M00 + alpha) * (M11 + alpha) - M01 * M10; + + float inv_M00 = (M11 + alpha) / det; + float inv_M01 = -M01 / det; + float inv_M10 = -M10 / det; + float inv_M11 = (M00 + alpha) / det; + + float inv_MM00 = inv_M00 * M00 + inv_M01 * M10; + float inv_MM01 = inv_M00 * M01 + inv_M01 * M11; + float inv_MM10 = inv_M10 * M00 + inv_M11 * M10; + float inv_MM11 = inv_M10 * M01 + inv_M11 * M11; + + mv.row = inv_M00 * avg_nb_mv.row * alpha + inv_M01 * avg_nb_mv.col * alpha + + inv_MM00 * scaled_search_mv.row + inv_MM01 * scaled_search_mv.col; + mv.col = inv_M10 * avg_nb_mv.row * alpha + inv_M11 * avg_nb_mv.col * alpha + + inv_MM10 * scaled_search_mv.row + inv_MM11 * scaled_search_mv.col; + } + return mv; +} + +void vp9_get_smooth_motion_field(const MV *search_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, BLOCK_SIZE bsize, + float alpha, int num_iters, MV *smooth_mf) { + // M is the local variation of reference frame + // build two buffers + FloatMV *input = (FloatMV *)malloc(rows * cols * sizeof(FloatMV)); + FloatMV *output = (FloatMV *)malloc(rows * cols * sizeof(FloatMV)); + int idx; + int row, col; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + if (!(input && output)) goto fail; + // copy search results to input buffer + for (idx = 0; idx < rows * cols; ++idx) { + input[idx].row = (float)search_mf[idx].row / bh; + input[idx].col = (float)search_mf[idx].col / bw; + } + for (idx = 0; idx < num_iters; ++idx) { + FloatMV *tmp; + for (row = 0; row < rows; ++row) { + for (col = 0; col < cols; ++col) { + // note: the scaled_search_mf and smooth_mf are all scaled by macroblock + // size + const MV search_mv = search_mf[row * cols + col]; + FloatMV scaled_search_mv = { (float)search_mv.row / bh, + (float)search_mv.col / bw }; + output[row * cols + col] = get_smooth_motion_vector( + scaled_search_mv, input, M, rows, cols, row, col, alpha); + } + } + // swap buffers + tmp = input; + input = output; + output = tmp; + } + // copy smoothed results to output + for (idx = 0; idx < rows * cols; ++idx) { + smooth_mf[idx].row = (int)(input[idx].row * bh); + smooth_mf[idx].col = (int)(input[idx].col * bw); + } +fail: + free(input); + free(output); +} + +void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame, + const YV12_BUFFER_CONFIG *ref_frame, + const MV *search_mf, + const vp9_variance_fn_ptr_t *fn_ptr, int rows, + int cols, BLOCK_SIZE bsize, + int (*M)[MF_LOCAL_STRUCTURE_SIZE]) { + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int cur_stride = cur_frame->y_stride; + const int ref_stride = ref_frame->y_stride; + const int width = ref_frame->y_width; + const int height = ref_frame->y_height; + int row, col; + for (row = 0; row < rows; ++row) { + for (col = 0; col < cols; ++col) { + int cur_offset = row * bh * cur_stride + col * bw; + uint8_t *center = cur_frame->y_buffer + cur_offset; + int ref_h = row * bh + search_mf[row * cols + col].row; + int ref_w = col * bw + search_mf[row * cols + col].col; + int ref_offset; + uint8_t *target; + uint8_t *nb; + int search_dist; + int nb_dist; + int I_row = 0, I_col = 0; + // TODO(Dan): handle the case that when reference frame block beyond the + // boundary + ref_h = ref_h < 0 ? 0 : (ref_h >= height - bh ? height - bh - 1 : ref_h); + ref_w = ref_w < 0 ? 0 : (ref_w >= width - bw ? width - bw - 1 : ref_w); + // compute search results distortion + // TODO(Dan): maybe need to use vp9 function to find the reference block, + // to compare with the results of my python code, I first use my way to + // compute the reference block + ref_offset = ref_h * ref_stride + ref_w; + target = ref_frame->y_buffer + ref_offset; + search_dist = fn_ptr->sdf(center, cur_stride, target, ref_stride); + // compute target's neighbors' distortions + // TODO(Dan): if using padding, the boundary condition may vary + // up + if (ref_h - bh >= 0) { + nb = target - ref_stride * bh; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_row += nb_dist - search_dist; + } + // down + if (ref_h + bh < height - bh) { + nb = target + ref_stride * bh; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_row += nb_dist - search_dist; + } + if (ref_h - bh >= 0 && ref_h + bh < height - bh) { + I_row /= 2; + } + I_row /= (bw * bh); + // left + if (ref_w - bw >= 0) { + nb = target - bw; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_col += nb_dist - search_dist; + } + // down + if (ref_w + bw < width - bw) { + nb = target + bw; + nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride); + I_col += nb_dist - search_dist; + } + if (ref_w - bw >= 0 && ref_w + bw < width - bw) { + I_col /= 2; + } + I_col /= (bw * bh); + M[row * cols + col][0] = I_row * I_row; + M[row * cols + col][1] = I_row * I_col; + M[row * cols + col][2] = I_col * I_row; + M[row * cols + col][3] = I_col * I_col; + } + } +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h new file mode 100644 index 0000000000..c2bd69722a --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ +#define VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ + +#include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_scale/yv12config.h" +#include "vpx_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif +#define NB_MVS_NUM 4 +#define LOG2_PRECISION 20 +#define MF_LOCAL_STRUCTURE_SIZE 4 +#define SQUARE_BLOCK_SIZES 4 + +typedef enum Status { STATUS_OK = 0, STATUS_FAILED = 1 } Status; + +typedef struct MotionField { + int ready; + BLOCK_SIZE bsize; + int block_rows; + int block_cols; + int block_num; // block_num == block_rows * block_cols + int (*local_structure)[MF_LOCAL_STRUCTURE_SIZE]; + int_mv *mf; + int *set_mv; + int mv_log_scale; +} MotionField; + +typedef struct MotionFieldInfo { + int frame_num; + int allocated; + MotionField (*motion_field_array)[MAX_INTER_REF_FRAMES][SQUARE_BLOCK_SIZES]; +} MotionFieldInfo; + +typedef struct { + float row, col; +} FloatMV; + +static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4) { + return 0; + } + if (bsize == BLOCK_8X8) { + return 1; + } + if (bsize == BLOCK_16X16) { + return 2; + } + if (bsize == BLOCK_32X32) { + return 3; + } + assert(0 && "ERROR: non-square block size"); + return -1; +} + +static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { + if (square_block_idx == 0) { + return BLOCK_4X4; + } + if (square_block_idx == 1) { + return BLOCK_8X8; + } + if (square_block_idx == 2) { + return BLOCK_16X16; + } + if (square_block_idx == 3) { + return BLOCK_32X32; + } + assert(0 && "ERROR: invalid square_block_idx"); + return BLOCK_INVALID; +} + +Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info, + int frame_num, int mi_rows, int mi_cols); + +Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize, + int block_rows, int block_cols); + +void vp9_free_motion_field(MotionField *motion_field); + +void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info); + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs, + int mv_num); + +void vp9_get_smooth_motion_field(const MV *search_mf, + const int (*M)[MF_LOCAL_STRUCTURE_SIZE], + int rows, int cols, BLOCK_SIZE bize, + float alpha, int num_iters, MV *smooth_mf); + +void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame, + const YV12_BUFFER_CONFIG *ref_frame, + const MV *search_mf, + const vp9_variance_fn_ptr_t *fn_ptr, int rows, + int cols, BLOCK_SIZE bsize, + int (*M)[MF_LOCAL_STRUCTURE_SIZE]); + +MotionField *vp9_motion_field_info_get_motion_field( + MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx, + BLOCK_SIZE bsize); + +void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row, + int mi_col, int_mv mv); + +void vp9_motion_field_reset_mvs(MotionField *motion_field); + +int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow, + int bcol); +int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row, + int mi_col); +int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow, + int bcol); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h new file mode 100644 index 0000000000..09c0e30a47 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ +#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. + const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Partition search breakout model. +#define FEATURES 4 +#define Q_CTX 3 +#define RESOLUTION_CTX 2 +static const float + vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, + }, + { + { + -0.064309f, + -0.006121f, + 0.000232f, + 0.005778f, + 0.7989465f - 5.0f, + }, + { + -0.314957f, + -0.009346f, + -0.000225f, + 0.010072f, + 2.80695581f - 5.5f, + }, + { + -0.635535f, + -0.015135f, + 0.000091f, + 0.015247f, + 2.90381241f - 5.0f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, + }, + { + { + -0.024629f, + -0.006492f, + -0.000254f, + 0.004895f, + 1.27919173f - 4.5f, + }, + { + -0.083936f, + -0.009827f, + -0.000200f, + 0.010399f, + 2.73731065f - 4.5f, + }, + { + -0.279052f, + -0.013334f, + 0.000289f, + 0.023203f, + 2.43595719f - 3.5f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, + }, + { + { + -0.007636f, + -0.002751f, + -0.000682f, + 0.005968f, + 0.19225763f - 4.5f, + }, + { + -0.047306f, + -0.009113f, + -0.000518f, + 0.016007f, + 2.61068869f - 4.0f, + }, + { + -0.069336f, + -0.010448f, + -0.001120f, + 0.023083f, + 1.47591054f - 5.5f, + }, + }, + }; + +static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX] + [FEATURES + 1] = { + { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, + }, + { + { + -0.003221f, + -0.002125f, + 0.000993f, + 0.012768f, + 0.03541421f - 2.0f, + }, + { + -0.006069f, + -0.007335f, + 0.000229f, + 0.026104f, + 0.17135315f - 1.5f, + }, + { + -0.039894f, + -0.011419f, + 0.000070f, + 0.061817f, + 0.6739977f - 1.5f, + }, + }, + }; +#undef FEATURES +#undef Q_CTX +#undef RESOLUTION_CTX + +// Rectangular partition search pruning model. +#define FEATURES 8 +#define LABELS 4 +#define NODES 16 +static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = { + -0.432522f, 0.133070f, -0.169187f, 0.768340f, 0.891228f, 0.554458f, + 0.356000f, 0.403621f, 0.809165f, 0.778214f, -0.520357f, 0.301451f, + -0.386972f, -0.314402f, 0.021878f, 1.148746f, -0.462258f, -0.175524f, + -0.344589f, -0.475159f, -0.232322f, 0.471147f, -0.489948f, 0.467740f, + -0.391550f, 0.208601f, 0.054138f, 0.076859f, -0.309497f, -0.095927f, + 0.225917f, 0.011582f, -0.520730f, -0.585497f, 0.174036f, 0.072521f, + 0.120771f, -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f, + 0.290584f, 0.038373f, 0.685654f, 0.394019f, 0.759667f, 1.257502f, + -0.610516f, -0.185434f, 0.211997f, -0.172458f, 0.044605f, 0.145316f, + -0.182525f, -0.147376f, 0.578742f, 0.312412f, -0.446135f, -0.389112f, + 0.454033f, 0.260490f, 0.664285f, 0.395856f, -0.231827f, 0.215228f, + 0.014856f, -0.395462f, 0.479646f, -0.391445f, -0.357788f, 0.166238f, + -0.056818f, -0.027783f, 0.060880f, -1.604710f, 0.531268f, 0.282184f, + 0.714944f, 0.093523f, -0.218312f, -0.095546f, -0.285621f, -0.190871f, + -0.448340f, -0.016611f, 0.413913f, -0.286720f, -0.158828f, -0.092635f, + -0.279551f, 0.166509f, -0.088162f, 0.446543f, -0.276830f, -0.065642f, + -0.176346f, -0.984754f, 0.338738f, 0.403809f, 0.738065f, 1.154439f, + 0.750764f, 0.770959f, -0.269403f, 0.295651f, -0.331858f, 0.367144f, + 0.279279f, 0.157419f, -0.348227f, -0.168608f, -0.956000f, -0.647136f, + 0.250516f, 0.858084f, 0.809802f, 0.492408f, 0.804841f, 0.282802f, + 0.079395f, -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f, + -0.483044f, 0.141126f, +}; + +static const float vp9_rect_part_nn_bias_16_layer0[NODES] = { + 0.275384f, -0.053745f, 0.000000f, 0.000000f, -0.178103f, 0.513965f, + -0.161352f, 0.228551f, 0.000000f, 1.013712f, 0.000000f, 0.000000f, + -1.144009f, -0.000006f, -0.241727f, 2.048764f, +}; + +static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = { + -1.435278f, 2.204691f, -0.410718f, 0.202708f, 0.109208f, 1.059142f, + -0.306360f, 0.845906f, 0.489654f, -1.121915f, -0.169133f, -0.003385f, + 0.660590f, -0.018711f, 1.227158f, -2.967504f, 1.407345f, -1.293243f, + -0.386921f, 0.300492f, 0.338824f, -0.083250f, -0.069454f, -1.001827f, + -0.327891f, 0.899353f, 0.367397f, -0.118601f, -0.171936f, -0.420646f, + -0.803319f, 2.029634f, 0.940268f, -0.664484f, 0.339916f, 0.315944f, + 0.157374f, -0.402482f, -0.491695f, 0.595827f, 0.015031f, 0.255887f, + -0.466327f, -0.212598f, 0.136485f, 0.033363f, -0.796921f, 1.414304f, + -0.282185f, -2.673571f, -0.280994f, 0.382658f, -0.350902f, 0.227926f, + 0.062602f, -1.000199f, 0.433731f, 1.176439f, -0.163216f, -0.229015f, + -0.640098f, -0.438852f, -0.947700f, 2.203434f, +}; + +static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = { + -0.875510f, + 0.982408f, + 0.560854f, + -0.415209f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_16_layer0, + vp9_rect_part_nn_weights_16_layer1, + }, + { + vp9_rect_part_nn_bias_16_layer0, + vp9_rect_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = { + -0.147312f, -0.753248f, 0.540206f, 0.661415f, 0.484117f, -0.341609f, + 0.016183f, 0.064177f, 0.781580f, 0.902232f, -0.505342f, 0.325183f, + -0.231072f, -0.120107f, -0.076216f, 0.120038f, 0.403695f, -0.463301f, + -0.192158f, 0.407442f, 0.106633f, 1.072371f, -0.446779f, 0.467353f, + 0.318812f, -0.505996f, -0.008768f, -0.239598f, 0.085480f, 0.284640f, + -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f, -0.228809f, + 0.383651f, -0.196882f, 0.477039f, -0.217978f, -0.506931f, -0.125675f, + 0.050456f, 1.086598f, 0.732128f, 0.326941f, 0.103952f, 0.121769f, + -0.154487f, -0.255514f, 0.030591f, -0.382797f, -0.019981f, -0.326570f, + 0.149691f, -0.435633f, -0.070795f, 0.167691f, 0.251413f, -0.153405f, + 0.160347f, 0.455107f, -0.968580f, -0.575879f, 0.623115f, -0.069793f, + -0.379768f, -0.965807f, -0.062057f, 0.071312f, 0.457098f, 0.350372f, + -0.460659f, -0.985393f, 0.359963f, -0.093677f, 0.404272f, -0.326896f, + -0.277752f, 0.609322f, -0.114193f, -0.230701f, 0.089208f, 0.645381f, + 0.494485f, 0.467876f, -0.166187f, 0.251044f, -0.394661f, 0.192895f, + -0.344777f, -0.041893f, -0.111163f, 0.066347f, 0.378158f, -0.455465f, + 0.339839f, -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f, + 0.361772f, -0.338095f, 0.004564f, -0.398510f, 0.060876f, -2.132504f, + -0.086776f, -0.029166f, 0.039241f, 0.222534f, -0.188565f, -0.288792f, + -0.160789f, -0.123905f, 0.397916f, -0.063779f, 0.167210f, -0.445004f, + 0.056889f, 0.207280f, 0.000101f, 0.384507f, -1.721239f, -2.036402f, + -2.084403f, -2.060483f, +}; + +static const float vp9_rect_part_nn_bias_32_layer0[NODES] = { + -0.859251f, -0.109938f, 0.091838f, 0.187817f, -0.728265f, 0.253080f, + 0.000000f, -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f, + -0.024504f, 1.765711f, 0.000000f, 1.505390f, +}; + +static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = { + 0.680940f, 1.367178f, 0.403075f, 0.029957f, 0.500917f, 1.407776f, + -0.354002f, 0.011667f, 1.663767f, 0.959155f, 0.428323f, -0.205345f, + -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f, + 0.128236f, -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f, + -1.677389f, -0.868332f, -0.063792f, 0.052052f, 0.359591f, 2.739808f, + -0.414304f, 3.036597f, -0.075368f, -1.019680f, 0.642501f, 0.209779f, + -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f, + 0.068734f, 0.508309f, 0.099334f, 1.802239f, -0.333538f, 2.708645f, + -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f, + 0.194572f, 0.431788f, -0.789624f, -0.031962f, 0.358353f, 0.382937f, + 0.232002f, 2.321813f, -0.037523f, 2.104652f, +}; + +static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = { + -0.693383f, + 0.773661f, + 0.426878f, + -0.070619f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_32_layer0, + vp9_rect_part_nn_weights_32_layer1, + }, + { + vp9_rect_part_nn_bias_32_layer0, + vp9_rect_part_nn_bias_32_layer1, + }, +}; +#undef NODES + +#define NODES 24 +static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = { + 0.024671f, -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f, + 0.139782f, 0.063110f, 0.796561f, 0.172868f, -0.662194f, -1.393074f, + 0.085003f, 0.393381f, 0.358477f, -0.187268f, -0.370745f, 0.218287f, + 0.027271f, -0.254089f, -0.048236f, -0.459137f, 0.253171f, 0.122598f, + -0.550107f, -0.568456f, 0.159866f, -0.246534f, 0.096384f, -0.255460f, + 0.077864f, -0.334837f, 0.026921f, -0.697252f, 0.345262f, 1.343578f, + 0.815984f, 1.118211f, 1.574016f, 0.578476f, -0.285967f, -0.508672f, + 0.118137f, 0.037695f, 1.540510f, 1.256648f, 1.163819f, 1.172027f, + 0.661551f, -0.111980f, -0.434204f, -0.894217f, 0.570524f, 0.050292f, + -0.113680f, 0.000784f, -0.211554f, -0.369394f, 0.158306f, -0.512505f, + -0.238696f, 0.091498f, -0.448490f, -0.491268f, -0.353112f, -0.303315f, + -0.428438f, 0.127998f, -0.406790f, -0.401786f, -0.279888f, -0.384223f, + 0.026100f, 0.041621f, -0.315818f, -0.087888f, 0.353497f, 0.163123f, + -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f, + 0.070854f, 0.114627f, -0.050545f, -0.160381f, 0.595294f, 0.492696f, + -0.453858f, -1.154139f, 0.126000f, 0.034550f, 0.456665f, -0.236618f, + -0.112640f, 0.050759f, -0.449162f, 0.110059f, 0.147116f, 0.249358f, + -0.049894f, 0.063351f, -0.004467f, 0.057242f, -0.482015f, -0.174335f, + -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f, -1.243430f, + -0.052963f, 0.112088f, -2.661115f, -2.445893f, -2.688174f, -2.624232f, + 0.030494f, 0.161311f, 0.012136f, 0.207564f, -2.776856f, -2.791940f, + -2.623962f, -2.918820f, 1.231619f, -0.376692f, -0.698078f, 0.110336f, + -0.285378f, 0.258367f, -0.180159f, -0.376608f, -0.034348f, -0.130206f, + 0.160020f, 0.852977f, 0.580573f, 1.450782f, 1.357596f, 0.787382f, + -0.544004f, -0.014795f, 0.032121f, -0.557696f, 0.159994f, -0.540908f, + 0.180380f, -0.398045f, 0.705095f, 0.515103f, -0.511521f, -1.271374f, + -0.231019f, 0.423647f, 0.064907f, -0.255338f, -0.877748f, -0.667205f, + 0.267847f, 0.135229f, 0.617844f, 1.349849f, 1.012623f, 0.730506f, + -0.078571f, 0.058401f, 0.053221f, -2.426146f, -0.098808f, -0.138508f, + -0.153299f, 0.149116f, -0.444243f, 0.301807f, 0.065066f, 0.092929f, + -0.372784f, -0.095540f, 0.192269f, 0.237894f, 0.080228f, -0.214074f, + -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f, +}; + +static const float vp9_rect_part_nn_bias_64_layer0[NODES] = { + 0.000000f, -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f, + -0.076307f, -0.193803f, 0.000000f, -0.066474f, -0.050318f, -0.019832f, + -0.038814f, -0.144184f, 2.652451f, 2.415006f, 0.197464f, -0.729842f, + -0.173774f, 0.239171f, 0.486425f, 2.463304f, -0.175279f, 2.352637f, +}; + +static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = { + -0.063237f, 1.925696f, -0.182145f, -0.226687f, 0.602941f, -0.941140f, + 0.814598f, -0.117063f, 0.282988f, 0.066369f, 0.096951f, 1.049735f, + -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f, 0.417145f, + -0.279849f, 1.335945f, 0.660338f, -2.757938f, -0.115714f, -1.862183f, + -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f, + -0.167290f, 0.141290f, -0.112100f, 0.232761f, 0.252307f, -0.399653f, + 0.353118f, 0.241583f, 2.635241f, 4.026119f, -1.137327f, -0.052446f, + -0.139814f, -1.104256f, -0.759391f, 2.508457f, -0.526297f, 2.095348f, + -0.444473f, -1.090452f, 0.584122f, 0.468729f, -0.368865f, 1.041425f, + -1.079504f, 0.348837f, 0.390091f, 0.416191f, 0.212906f, -0.660255f, + 0.053630f, 0.209476f, 3.595525f, 2.257293f, -0.514030f, 0.074203f, + -0.375862f, -1.998307f, -0.930310f, 1.866686f, -0.247137f, 1.087789f, + 0.100186f, 0.298150f, 0.165265f, 0.050478f, 0.249167f, 0.371789f, + -0.294497f, 0.202954f, 0.037310f, 0.193159f, 0.161551f, 0.301597f, + 0.299286f, 0.185946f, 0.822976f, 2.066130f, -1.724588f, 0.055977f, + -0.330747f, -0.067747f, -0.475801f, 1.555958f, -0.025808f, -0.081516f, +}; + +static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = { + -0.090723f, + 0.894968f, + 0.844754f, + -3.496194f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_64_layer0, + vp9_rect_part_nn_weights_64_layer1, + }, + { + vp9_rect_part_nn_bias_64_layer0, + vp9_rect_part_nn_bias_64_layer1, + }, +}; +#undef FEATURES +#undef LABELS +#undef NODES + +#define FEATURES 7 +// Partition pruning model(neural nets). +static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = { + -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f, + 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f, + -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f, + -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f, + -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f, + -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f, + 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f, + -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f, + -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f, + 0.068039f, 0.010586f, +}; + +static const float vp9_partition_nn_bias_64x64_layer0[8] = { + -3.469882f, 0.683989f, 0.194010f, 0.313782f, + -3.153335f, 2.245849f, -1.946190f, -3.740020f, +}; + +static const float vp9_partition_nn_weights_64x64_layer1[8] = { + -8.058566f, 0.108306f, -0.280620f, -0.818823f, + -6.445117f, 0.865364f, -1.127127f, -8.808660f, +}; + +static const float vp9_partition_nn_bias_64x64_layer1[1] = { + 6.46909416f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_64x64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_64x64_layer0, + vp9_partition_nn_weights_64x64_layer1, + }, + { + vp9_partition_nn_bias_64x64_layer0, + vp9_partition_nn_bias_64x64_layer1, + }, +}; + +static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = { + -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f, + -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f, + 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f, + -0.400747f, -0.394366f, -0.108878f, 0.603027f, 0.455369f, -0.197170f, + 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f, + -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f, + -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f, + -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f, + -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f, + 0.275387f, 1.552226f, +}; + +static const float vp9_partition_nn_bias_32x32_layer0[8] = { + -0.838372f, -2.609089f, -0.055763f, 1.329485f, + -1.297638f, -2.636622f, -0.826909f, 1.012644f, +}; + +static const float vp9_partition_nn_weights_32x32_layer1[8] = { + -1.792632f, -7.322353f, -0.683386f, 0.676564f, + -1.488118f, -7.527719f, 1.240163f, 0.614309f, +}; + +static const float vp9_partition_nn_bias_32x32_layer1[1] = { + 4.97422546f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_32x32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_32x32_layer0, + vp9_partition_nn_weights_32x32_layer1, + }, + { + vp9_partition_nn_bias_32x32_layer0, + vp9_partition_nn_bias_32x32_layer1, + }, +}; + +static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = { + -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f, + 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f, + 1.700638f, -0.073806f, -4.006781f, -1.630999f, -0.064863f, -0.086410f, + -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f, + 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f, + -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f, + -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f, + 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f, + 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f, + -2.197880f, -0.108733f, +}; + +static const float vp9_partition_nn_bias_16x16_layer0[8] = { + 0.411516f, -2.143737f, -3.693192f, 2.123142f, + -1.356910f, -3.561016f, -0.765045f, -2.417082f, +}; + +static const float vp9_partition_nn_weights_16x16_layer1[8] = { + -0.619755f, -2.202391f, -4.337171f, 0.611319f, + 0.377677f, -4.998723f, -1.052235f, 1.949922f, +}; + +static const float vp9_partition_nn_bias_16x16_layer1[1] = { + 3.20981717f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_16x16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_16x16_layer0, + vp9_partition_nn_weights_16x16_layer1, + }, + { + vp9_partition_nn_bias_16x16_layer0, + vp9_partition_nn_bias_16x16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, -2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, +}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 12 +#define LABELS 1 +#define NODES 8 +static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = { + -0.609728f, -0.409099f, -0.472449f, 0.183769f, -0.457740f, 0.081089f, + 0.171003f, 0.578696f, -0.019043f, -0.856142f, 0.557369f, -1.779424f, + -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f, + 0.200620f, 0.038013f, -0.430093f, 0.235083f, -0.487442f, 0.424814f, + -0.232758f, -0.442943f, 0.229397f, -0.540301f, -0.648421f, -0.649747f, + -0.171638f, 0.603824f, 0.468497f, -0.421580f, 0.178840f, -0.533838f, + -0.029471f, -0.076296f, 0.197426f, -0.187908f, -0.003950f, -0.065740f, + 0.085165f, -0.039674f, -5.640702f, 1.909538f, -1.434604f, 3.294606f, + -0.788812f, 0.196864f, 0.057012f, -0.019757f, 0.336233f, 0.075378f, + 0.081503f, 0.491864f, -1.899470f, -1.764173f, -1.888137f, -1.762343f, + 0.845542f, 0.202285f, 0.381948f, -0.150996f, 0.556893f, -0.305354f, + 0.561482f, -0.021974f, -0.703117f, 0.268638f, -0.665736f, 1.191005f, + -0.081568f, -0.115653f, 0.272029f, -0.140074f, 0.072683f, 0.092651f, + -0.472287f, -0.055790f, -0.434425f, 0.352055f, 0.048246f, 0.372865f, + 0.111499f, -0.338304f, 0.739133f, 0.156519f, -0.594644f, 0.137295f, + 0.613350f, -0.165102f, -1.003731f, 0.043070f, -0.887896f, -0.174202f, +}; + +static const float vp9_part_split_nn_bias_64_layer0[NODES] = { + 1.182714f, 0.000000f, 0.902019f, 0.953115f, + -1.372486f, -1.288740f, -0.155144f, -3.041362f, +}; + +static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = { + 0.841214f, 0.456016f, 0.869270f, 1.692999f, + -1.700494f, -0.911761f, 0.030111f, -1.447548f, +}; + +static const float vp9_part_split_nn_bias_64_layer1[LABELS] = { + 1.17782545f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_64_layer0, + vp9_part_split_nn_weights_64_layer1, + }, + { + vp9_part_split_nn_bias_64_layer0, + vp9_part_split_nn_bias_64_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = { + -0.105488f, -0.218662f, 0.010980f, -0.226979f, 0.028076f, 0.743430f, + 0.789266f, 0.031907f, -1.464200f, 0.222336f, -1.068493f, -0.052712f, + -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f, + 0.271346f, 0.133005f, 1.674203f, 0.689567f, 0.657133f, 0.283524f, + 0.115529f, 0.738327f, 0.317184f, -0.179736f, 0.403691f, 0.679350f, + 0.048925f, 0.271338f, -1.538921f, -0.900737f, -1.377845f, 0.084245f, + 0.803122f, -0.107806f, 0.103045f, -0.023335f, -0.098116f, -0.127809f, + 0.037665f, -0.523225f, 1.622185f, 1.903999f, 1.358889f, 1.680785f, + 0.027743f, 0.117906f, -0.158810f, 0.057775f, 0.168257f, 0.062414f, + 0.086228f, -0.087381f, -3.066082f, 3.021855f, -4.092155f, 2.550104f, + -0.230022f, -0.207445f, -0.000347f, 0.034042f, 0.097057f, 0.220088f, + -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f, 2.643355f, + 0.319912f, 0.585531f, -1.018225f, -0.699606f, 1.026490f, 0.169952f, + -0.093579f, -0.142352f, -0.107256f, 0.059598f, 0.043190f, 0.507543f, + -0.138617f, 0.030197f, 0.059574f, -0.634051f, -0.586724f, -0.148020f, + -0.334380f, 0.459547f, 1.620600f, 0.496850f, 0.639480f, -0.465715f, +}; + +static const float vp9_part_split_nn_bias_32_layer0[NODES] = { + -1.125885f, 0.753197f, -0.825808f, 0.004839f, + 0.583920f, 0.718062f, 0.976741f, 0.796188f, +}; + +static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = { + -0.458745f, 0.724624f, -0.479720f, -2.199872f, + 1.162661f, 1.194153f, -0.716896f, 0.824080f, +}; + +static const float vp9_part_split_nn_bias_32_layer1[LABELS] = { + 0.71644074f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_32_layer0, + vp9_part_split_nn_weights_32_layer1, + }, + { + vp9_part_split_nn_bias_32_layer0, + vp9_part_split_nn_bias_32_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = { + -0.003629f, -0.046852f, 0.220428f, -0.033042f, 0.049365f, 0.112818f, + -0.306149f, -0.005872f, 1.066947f, -2.290226f, 2.159505f, -0.618714f, + -0.213294f, 0.451372f, -0.199459f, 0.223730f, -0.321709f, 0.063364f, + 0.148704f, -0.293371f, 0.077225f, -0.421947f, -0.515543f, -0.240975f, + -0.418516f, 1.036523f, -0.009165f, 0.032484f, 1.086549f, 0.220322f, + -0.247585f, -0.221232f, -0.225050f, 0.993051f, 0.285907f, 1.308846f, + 0.707456f, 0.335152f, 0.234556f, 0.264590f, -0.078033f, 0.542226f, + 0.057777f, 0.163471f, 0.039245f, -0.725960f, 0.963780f, -0.972001f, + 0.252237f, -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f, + -0.621108f, 0.486405f, -0.221923f, 1.519426f, -0.857871f, 0.411595f, + 0.947188f, 0.203339f, 0.174526f, 0.016382f, 0.256879f, 0.049818f, + 0.057836f, -0.659096f, 0.459894f, 0.174695f, 0.379359f, 0.062530f, + -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f, + -0.109342f, 0.002455f, -0.078746f, -0.391871f, 0.149892f, -0.239615f, + -0.520709f, 0.118568f, -0.437975f, 0.118116f, -0.565426f, -0.206446f, + 0.113407f, 0.558894f, 0.534627f, 1.154350f, -0.116833f, 1.723311f, +}; + +static const float vp9_part_split_nn_bias_16_layer0[NODES] = { + 0.013109f, -0.034341f, 0.679845f, -0.035781f, + -0.104183f, 0.098055f, -0.041130f, 0.160107f, +}; + +static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = { + 1.499564f, -0.403259f, 1.366532f, -0.469868f, + 0.482227f, -2.076697f, 0.527691f, 0.540495f, +}; + +static const float vp9_part_split_nn_bias_16_layer1[LABELS] = { + 0.01134653f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_16_layer0, + vp9_part_split_nn_weights_16_layer1, + }, + { + vp9_part_split_nn_bias_16_layer0, + vp9_part_split_nn_bias_16_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = { + -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f, + -0.589145f, 0.203704f, -0.051007f, -0.113769f, -0.477511f, -0.122603f, + -1.329890f, 1.403386f, 0.199636f, -0.161139f, 2.182090f, -0.014307f, + 0.015755f, -0.208468f, 0.884353f, 0.815920f, 0.632464f, 0.838225f, + 1.369483f, -0.029068f, 0.570213f, -0.573546f, 0.029617f, 0.562054f, + -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f, + 0.173047f, -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f, + -0.406704f, 0.411230f, -0.144023f, 0.745651f, -0.595091f, 0.111787f, + 0.840651f, 0.030123f, -0.242155f, 0.101486f, -0.017889f, -0.254467f, + -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f, + 1.623949f, -0.286369f, 0.170976f, 0.016442f, -0.598353f, -0.038540f, + 0.202597f, -0.933582f, 0.599510f, 0.362273f, 0.577722f, 0.477603f, + 0.767097f, 0.431532f, 0.457034f, 0.223279f, 0.381349f, 0.033777f, + 0.423923f, -0.664762f, 0.385662f, 0.075744f, 0.182681f, 0.024118f, + 0.319408f, -0.528864f, 0.976537f, -0.305971f, -0.189380f, -0.241689f, + -1.318092f, 0.088647f, -0.109030f, -0.945654f, 1.082797f, 0.184564f, +}; + +static const float vp9_part_split_nn_bias_8_layer0[NODES] = { + -0.237472f, 2.051396f, 0.297062f, -0.730194f, + 0.060472f, -0.565959f, 0.560869f, -0.395448f, +}; + +static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = { + 0.568121f, 1.575915f, -0.544309f, 0.751595f, + -0.117911f, -1.340730f, -0.739671f, 0.661216f, +}; + +static const float vp9_part_split_nn_bias_8_layer1[LABELS] = { + -0.63375306f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_8_layer0, + vp9_part_split_nn_weights_8_layer1, + }, + { + vp9_part_split_nn_bias_8_layer0, + vp9_part_split_nn_bias_8_layer1, + }, +}; +#undef NODES +#undef FEATURES +#undef LABELS + +// Partition pruning model(linear). +static const float vp9_partition_feature_mean[24] = { + 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, + 689.413511f, 162.027012f, 1.478213f, 0.0, + 135382.260230f, 912738.513263f, 28.845217f, 1.515230f, + 544.158492f, 131.807995f, 1.436863f, 0.0f, + 43682.377587f, 208131.711766f, 28.084737f, 1.356677f, + 138.254122f, 119.522553f, 1.252322f, 0.0f, +}; + +static const float vp9_partition_feature_std[24] = { + 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f, + 985.880847f, 0.014638f, 2.001898f, 0.0f, + 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f, + 396.986910f, 0.015657f, 1.332541f, 0.0f, + 55888.847031f, 448587.962714f, 0.017900f, 1.904776f, + 98.652832f, 0.016598f, 1.320992f, 0.0f, +}; + +// Error tolerance: 0.01%-0.0.05%-0.1% +static const float vp9_partition_linear_weights[24] = { + 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f, + 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f, + 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f, + 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c index 6fc7cd1e3c..3a620df693 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c @@ -24,10 +24,20 @@ #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_quantize.h" +static unsigned int get_section_intra_rating(const VP9_COMP *cpi) { + unsigned int section_intra_rating; + + section_intra_rating = (cpi->common.frame_type == KEY_FRAME) + ? cpi->twopass.key_frame_section_intra_rating + : cpi->twopass.section_intra_rating; + + return section_intra_rating; +} + static int get_max_filter_level(const VP9_COMP *cpi) { if (cpi->oxcf.pass == 2) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; + unsigned int section_intra_rating = get_section_intra_rating(cpi); + return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } @@ -81,6 +91,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; + unsigned int section_intra_rating = get_section_intra_rating(cpi); // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); @@ -99,8 +110,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) - bias = (bias * cpi->twopass.section_intra_rating) / 20; + if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20)) + bias = (bias * section_intra_rating) / 20; // yx, bias less for large block size if (cm->tx_mode != ONLY_4X4) bias >>= 1; @@ -150,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { lf->filter_level = 0; @@ -169,22 +180,20 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); +#endif // CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) filt_guess = 5 * filt_guess >> 3; -#endif // CONFIG_VP9_HIGHBITDEPTH if (cm->frame_type == KEY_FRAME) filt_guess -= 4; lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); } else { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h index cecca058b4..8881b44daa 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_PICKLPF_H_ -#define VP9_ENCODER_VP9_PICKLPF_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_ +#define VPX_VP9_ENCODER_VP9_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ void vp9_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKLPF_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c index 33f3f5a476..b841385baa 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" @@ -19,7 +20,7 @@ #include "vpx/vpx_codec.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" +#include "vpx_ports/compiler_attributes.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" @@ -41,6 +42,17 @@ typedef struct { int in_use; } PRED_BUFFER; +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_SIZE best_intra_tx_size; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + INTERP_FILTER best_pred_filter; +} BEST_PICKMODE; + static const int pos_shift_16x16[4][4] = { { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } }; @@ -117,15 +129,24 @@ static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm, const MACROBLOCK *x, !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && ref_frame == LAST_FRAME) { // Get base layer mv. - MV_REF *candidate = - &cm->prev_frame - ->mvs[(mi_col >> 1) + (mi_row >> 1) * (cm->mi_cols >> 1)]; - if (candidate->mv[0].as_int != INVALID_MV) { - base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2); - base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2); - clamp_mv_ref(&base_mv->as_mv, xd); - } else { - base_mv->as_int = INVALID_MV; + const int prev_layer = cpi->svc.spatial_layer_id - 1; + const int index = + (mi_col >> 1) + (mi_row >> 1) * cpi->svc.mi_cols[prev_layer]; + // prev_frame->mvs[] is allocated to size mi_cols * mi_rows corresponding + // to the previous spatial layer, so the index check is against + // svc.mi_col/rows[prev_layer]. + if (index < cpi->svc.mi_cols[prev_layer] * cpi->svc.mi_rows[prev_layer]) { + MV_REF *candidate = &cm->prev_frame->mvs[index]; + // Avoid using base_mv if scaled mv is out of range, for either component. + if (candidate->mv[0].as_int != INVALID_MV && + abs(candidate->mv[0].as_mv.row) <= INT16_MAX >> 1 && + abs(candidate->mv[0].as_mv.col) <= INT16_MAX >> 1) { + base_mv->as_mv.row = candidate->mv[0].as_mv.row * 2; + base_mv->as_mv.col = candidate->mv[0].as_mv.col * 2; + clamp_mv_ref(&base_mv->as_mv, xd); + } else { + base_mv->as_int = INVALID_MV; + } } } @@ -158,6 +179,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MvLimits tmp_mv_limits = x->mv_limits; int rv = 0; int cost_list[5]; + int search_subpel = 1; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (scaled_ref_frame) { @@ -170,6 +192,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + // Limit motion vector for large lightning change. + if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) { + x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10); + x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10); + x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10); + x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10); + } + assert(x->mv_best_ref_index[ref] <= 2); if (x->mv_best_ref_index[ref] < 2) mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; @@ -184,9 +214,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), ¢er_mv, - &tmp_mv->as_mv, INT_MAX, 0); + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + } x->mv_limits = tmp_mv_limits; @@ -202,15 +237,28 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); - if (rv) { - const int subpel_force_stop = use_base_mv && cpi->sf.base_mv_aggressive - ? 2 - : cpi->sf.mv.subpel_force_stop; + // For SVC on non-reference frame, avoid subpel for (0, 0) motion. + if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -230,6 +278,7 @@ static void block_variance(const uint8_t *src, int src_stride, #endif uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { int i, j, k = 0; + uint32_t k_sqr = 0; *sse = 0; *sum = 0; @@ -267,7 +316,8 @@ static void block_variance(const uint8_t *src, int src_stride, #endif *sse += sse8x8[k]; *sum += sum8x8[k]; - var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k]; k++; } } @@ -281,6 +331,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); int i, j, k = 0; + uint32_t k_sqr = 0; for (i = 0; i < nh; i += 2) { for (j = 0; j < nw; j += 2) { @@ -288,19 +339,109 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size, sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1]; sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] + sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1]; - var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> - (b_width_log2_lookup[unit_size] + - b_height_log2_lookup[unit_size] + 6)); + k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >> + (b_width_log2_lookup[unit_size] + + b_height_log2_lookup[unit_size] + 6)); + var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k]; k++; } } } +// Adjust the ac_thr according to speed, width, height and normalized sum +static int ac_thr_factor(const int speed, const int width, const int height, + const int norm_sum) { + if (speed >= 8 && norm_sum < 5) { + if (width <= 640 && height <= 480) + return 4; + else + return 2; + } + return 1; +} + +static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCKD *const xd, unsigned int var, + unsigned int sse, int64_t ac_thr, + unsigned int source_variance, int is_intra) { + // TODO(marpan): Tune selection for intra-modes, screen content, etc. + TX_SIZE tx_size; + unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1; + int limit_tx = 1; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + (source_variance == 0 || var < var_thresh)) + limit_tx = 0; + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16 && limit_tx) + tx_size = TX_16X16; + // For screen-content force 4X4 tx_size over 8X8, for large variance. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 && + bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr)) + tx_size = TX_4X4; + } else { + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + return tx_size; +} + +static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int row, col; + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, + x->skip_encode ? p->src.buf : pd->dst.buf, + x->skip_encode ? src_stride : dst_stride, + pd->dst.buf, dst_stride, col, row, 0); + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, unsigned int *sse_y, - int mi_row, int mi_col, int *early_term) { + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -311,8 +452,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, struct macroblockd_plane *const pd = &xd->plane[0]; const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; - const int64_t ac_thr = ac_quant * ac_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; int skip_dc = 0; @@ -325,6 +466,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, unsigned int var8x8[64] = { 0 }; TX_SIZE tx_size; int i, k; + uint32_t sum_sqr; #if CONFIG_VP9_HIGHBITDEPTH const vpx_bit_depth_t bd = cpi->common.bit_depth; #endif @@ -336,31 +478,37 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.use_highbitdepth, bd, #endif sse8x8, sum8x8, var8x8); - var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4)); + sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4); + var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse; *var_y = var; *sse_y = sse; - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - tx_size = TX_8X8; +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->oxcf.speed > 5) + ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level, + (abs(sum) >> (bw + bh)), + cpi->svc.temporal_layer_id); + else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#else + ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width, + cpi->common.height, abs(sum) >> (bw + bh)); +#endif - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - tx_size = TX_8X8; - else if (tx_size > TX_16X16) - tx_size = TX_16X16; - } else { - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } - - assert(tx_size >= TX_8X8); + tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, 0); + // The code below for setting skip flag assumes tranform size of at least 8x8, + // so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; xd->mi[0]->tx_size = tx_size; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source && + x->source_variance == 0) + dc_thr = dc_thr << 1; + // Evaluate if the partition block is a skippable block in Y plane. { unsigned int sse16x16[16] = { 0 }; @@ -428,22 +576,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); const int uv_bw = b_width_log2_lookup[uv_bsize]; const int uv_bh = b_height_log2_lookup[uv_bsize]; const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) @@ -457,7 +609,6 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, if (skip_uv[0] & skip_uv[1]) { *early_term = 1; } - return; } @@ -494,7 +645,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, - unsigned int *sse_y) { + unsigned int *sse_y, int is_intra) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -514,24 +665,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *var_y = var; *sse_y = sse; - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - xd->mi[0]->tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - xd->mi[0]->tx_size = TX_8X8; - else if (xd->mi[0]->tx_size > TX_16X16) - xd->mi[0]->tx_size = TX_16X16; - } else { - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, is_intra); // Evaluate if the partition block is a skippable block in Y plane. { @@ -590,24 +725,9 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *out_dist_sum += dist << 4; } -#if CONFIG_VP9_HIGHBITDEPTH static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { - MACROBLOCKD *xd = &x->e_mbd; - unsigned int var_y, sse_y; - - (void)tx_size; - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y, - &sse_y); - *sse = INT_MAX; - *skippable = 0; - return; -} -#else -static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, - int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size) { + TX_SIZE tx_size, int rd_computed, int is_intra) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -624,20 +744,44 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; + if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { + unsigned int var_y, sse_y; + (void)tx_size; + if (!rd_computed) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, + &var_y, &sse_y, is_intra); + *sse = INT_MAX; + *skippable = 0; + return; + } + (void)cpi; // The max tx_size passed in is TX_16X16. assert(tx_size != TX_32X32); - +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); +#endif *skippable = 1; // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < num_4x4_w; c += block_step) { if (c < max_blocks_wide) { - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -646,29 +790,26 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int16_t *src_diff; src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); + switch (tx_size) { case TX_16X16: - vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_hadamard_16x16(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: - vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); - vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_hadamard_8x8(src_diff, diff_stride, coeff); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, - p->quant_fp, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, - scan_order->iscan); + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -699,7 +840,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, if (*eob == 1) this_rdc->rate += (int)abs(qcoeff[0]); else if (*eob > 1) - this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4); + this_rdc->rate += vpx_satd(qcoeff, step << 4); this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2; } @@ -711,7 +852,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT); this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT); } -#endif static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize, MACROBLOCK *x, MACROBLOCKD *xd, @@ -799,13 +939,11 @@ static void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } -static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, - PREDICTION_MODE this_mode, unsigned int var_y, - unsigned int sse_y, - struct buf_2d yv12_mb[][MAX_MB_PLANE], - int *rate, int64_t *dist) { +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); @@ -815,6 +953,8 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) motion_low = 0; @@ -865,9 +1005,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, thresh_dc_uv = 0; } - // Skip UV prediction unless breakout is zero (lossless) to save - // computation with low impact on the result - if (x->encode_breakout == 0) { + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); @@ -921,8 +1059,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, VP9_COMP *const cpi = args->cpi; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; uint8_t *const src_buf_base = p->src.buf; uint8_t *const dst_buf_base = pd->dst.buf; @@ -932,8 +1070,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, (void)block; - p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; - pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; // Use source buffer as an approximation for the fully reconstructed buffer. vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size, args->mode, x->skip_encode ? p->src.buf : pd->dst.buf, @@ -942,13 +1080,12 @@ static void estimate_block_intra(int plane, int block, int row, int col, if (plane == 0) { int64_t this_sse = INT64_MAX; - // TODO(jingning): This needs further refactoring. block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16)); + VPXMIN(tx_size, TX_16X16), 0, 1); } else { unsigned int var = 0; unsigned int sse = 0; - model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane, + model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane, plane); } @@ -958,10 +1095,11 @@ static void estimate_block_intra(int plane, int block, int row, int col, args->rdc->dist += this_rdc.dist; } -static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = { +static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = { { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM }, { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV }, { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG }, + { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA }, }; static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED, @@ -981,8 +1119,34 @@ static int mode_offset(const PREDICTION_MODE mode) { } } +static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh, + const int *const thresh_fact) { + int is_rd_less_than_thresh; + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + return is_rd_less_than_thresh; +} + +static INLINE void update_thresh_freq_fact_row_mt( + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, + int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx; + int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx]; + if (thr_mode_idx == best_mode_idx) + *freq_fact -= (*freq_fact >> 4); + else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV && + ref_frame == LAST_FRAME && source_variance < 5) { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32); + } else { + *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + static INLINE void update_thresh_freq_fact( - VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance, + VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance, BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx, PREDICTION_MODE mode) { THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; @@ -1015,6 +1179,7 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); bmode_costs = cpi->y_mode_costs[A][L]; + assert(bsize >= BLOCK_8X8); (void)ctx; vp9_rd_cost_reset(&best_rdc); @@ -1089,33 +1254,22 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; -static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { - { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV }, - { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { + { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, + { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; -static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - // Reduce the intra cost penalty for small blocks (<=16x16). - int reduction_fac = - (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) - // Don't reduce intra cost penalty if estimated noise level is high. - reduction_fac = 0; - return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q, - cm->bit_depth) >> - reduction_fac; -} - static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, - const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, + TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, - int force_skip_low_temp_var) { + int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1125,11 +1279,11 @@ static INLINE void find_predictors( frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; // this needs various further optimizations. to be continued.. - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) { + if (cm->use_prev_frame_mvs || comp_pred_allowed) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); } else { @@ -1155,7 +1309,8 @@ static INLINE void find_predictors( static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, PREDICTION_MODE this_mode, RD_COST *this_rdc, BLOCK_SIZE bsize, int mv_row, int mv_col, - int is_last_frame) { + int is_last_frame, int lowvar_highsumdiff, + int is_skin) { // Bias against MVs associated with NEWMV mode that are very different from // top/left neighbors. if (this_mode == NEWMV) { @@ -1202,9 +1357,12 @@ static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd, // If noise estimation is enabled, and estimated level is above threshold, // add a bias to LAST reference with small motion, for large blocks. if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 && - is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) { - this_rdc->rdcost = 7 * this_rdc->rdcost >> 3; - } + is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); + else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 && + is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 && + mv_col > -16) + this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3); } #if CONFIG_VP9_TEMPORAL_DENOISING @@ -1212,18 +1370,16 @@ static void vp9_pickmode_ctx_den_update( VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, int ref_frame_cost[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred, - TX_SIZE best_tx_size, PREDICTION_MODE best_mode, - MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter, - uint8_t best_mode_skip_txfm) { + BEST_PICKMODE *bp) { ctx_den->zero_last_cost_orig = zero_last_cost_orig; ctx_den->ref_frame_cost = ref_frame_cost; ctx_den->frame_mv = frame_mv; ctx_den->reuse_inter_pred = reuse_inter_pred; - ctx_den->best_tx_size = best_tx_size; - ctx_den->best_mode = best_mode; - ctx_den->best_ref_frame = best_ref_frame; - ctx_den->best_pred_filter = best_pred_filter; - ctx_den->best_mode_skip_txfm = best_mode_skip_txfm; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; } static void recheck_zeromv_after_denoising( @@ -1239,8 +1395,10 @@ static void recheck_zeromv_after_denoising( ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) && ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) || (ctx_den->best_ref_frame == GOLDEN_FRAME && + cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. + VP9_COMMON *const cm = &cpi->common; int rate = 0; int64_t dist = 0; uint32_t var_y = UINT_MAX; @@ -1248,12 +1406,14 @@ static void recheck_zeromv_after_denoising( RD_COST this_rdc; mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; + if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0); this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] [INTER_OFFSET(ZEROMV)]; @@ -1265,6 +1425,7 @@ static void recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1335,42 +1496,240 @@ static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, return force_skip_low_temp_var; } +static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int mi_row, int mi_col, PRED_BUFFER *tmp, + BLOCK_SIZE bsize, int reuse_inter_pred, + PRED_BUFFER **this_mode_pred, unsigned int *var_y, + unsigned int *sse_y, int force_smooth_filter, + int *this_early_term, int *flag_preduv_computed, + int use_model_yrd_large) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + + int pf_rate[3] = { 0 }; + int64_t pf_dist[3] = { 0 }; + int curr_rate[3] = { 0 }; + unsigned int pf_var[3] = { 0 }; + unsigned int pf_sse[3] = { 0 }; + TX_SIZE pf_tx_size[3] = { 0 }; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = *this_mode_pred; + uint8_t skip_txfm = SKIP_TXFM_NONE; + int best_early_term = 0; + int best_flag_preduv_computed[2] = { 0 }; + INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP; + INTERP_FILTER filter_end = EIGHTTAP_SMOOTH; + for (filter = filter_start; filter <= filter_end; ++filter) { + int64_t cost; + mi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + // For large partition blocks, extra testing is done. + if (use_model_yrd_large) + model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], + &pf_sse[filter], mi_row, mi_col, this_early_term, + flag_preduv_computed); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], + &pf_var[filter], &pf_sse[filter], 0); + curr_rate[filter] = pf_rate[filter]; + pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); + cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); + pf_tx_size[filter] = mi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + best_early_term = *this_early_term; + best_flag_preduv_computed[0] = flag_preduv_computed[0]; + best_flag_preduv_computed[1] = flag_preduv_computed[1]; + + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + if (filter != filter_end) { + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + } + + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filter = best_filter; + mi->tx_size = pf_tx_size[best_filter]; + this_rdc->rate = curr_rate[best_filter]; + this_rdc->dist = pf_dist[best_filter]; + *var_y = pf_var[best_filter]; + *sse_y = pf_sse[best_filter]; + x->skip_txfm[0] = skip_txfm; + *this_early_term = best_early_term; + flag_preduv_computed[0] = best_flag_preduv_computed[0]; + flag_preduv_computed[1] = best_flag_preduv_computed[1]; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter < filter_end) { + mi->interp_filter = best_filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } +} + +static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][MAX_REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, + unsigned int best_sse_sofar, RD_COST *best_rdc) { + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + SPEED_FEATURES *const sf = &cpi->sf; + + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == VPX_CBR) { + int tmp_sad; + uint32_t dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + cpi->find_fractional_mv_step( + x, &frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + } else if (svc->use_base_mv && svc->spatial_layer_id) { + if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar) + return -1; + if ((base_mv_sse >> 1) < best_sse_sofar) { + // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 1)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_mode = ZEROMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_tx_size = TX_SIZES; + bp->best_intra_tx_size = TX_SIZES; + bp->best_pred_filter = EIGHTTAP; + bp->best_mode_skip_txfm = SKIP_TXFM_NONE; + bp->best_second_ref_frame = NO_REF_FRAME; + bp->best_pred = NULL; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - PREDICTION_MODE best_mode = ZEROMV; - MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - MV_REFERENCE_FRAME usable_ref_frame; - TX_SIZE best_tx_size = TX_SIZES; - INTERP_FILTER best_pred_filter = EIGHTTAP; + + BEST_PICKMODE best_pickmode; + + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 }; RD_COST this_rdc, best_rdc; - uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; - const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES; + const int *const rd_thresh_freq_fact = + (cpi->sf.adaptive_rd_thresh_row_mt) + ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) + : tile_data->thresh_freq_fact[bsize]; +#if CONFIG_VP9_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; +#endif INTERP_FILTER filter_ref; - const int bsl = mi_width_log2_lookup[bsize]; - const int pred_filter_search = - cm->interp_filter == SWITCHABLE - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + int pred_filter_search = cm->interp_filter == SWITCHABLE; int const_motion[MAX_REF_FRAMES] = { 0 }; const int bh = num_4x4_blocks_high_lookup[bsize] << 2; const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -1378,12 +1737,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // process. // tmp[3] points to dst buffer, and the other 3 point to allocated buffers. PRED_BUFFER tmp[4]; - DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]); + DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64] VPX_UNINITIALIZED); #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]); + DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64] VPX_UNINITIALIZED); #endif struct buf_2d orig_dst = pd->dst; - PRED_BUFFER *best_pred = NULL; PRED_BUFFER *this_mode_pred = NULL; const int pixels_in_block = bh * bw; int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready; @@ -1397,12 +1755,87 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; int skip_ref_find_pred[4] = { 0 }; + unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; + int gf_temporal_ref = 0; + int force_test_gf_zeromv = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; + int denoise_svc_pickmode = 1; #endif + INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; + const struct segmentation *const seg = &cm->seg; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; + int no_scaling = 0; + int large_block = 0; + int use_model_yrd_large = 0; + unsigned int thresh_svc_skip_golden = 500; + unsigned int thresh_skip_golden = 500; + int force_smooth_filter = cpi->sf.force_smooth_interpol; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + + init_best_pickmode(&best_pickmode); + + x->encode_breakout = seg->enabled + ? cpi->segment_encode_breakout[mi->segment_id] + : cpi->encode_breakout; + + x->source_variance = UINT_MAX; + if (cpi->sf.default_interp_filter == BILINEAR) { + best_pickmode.best_pred_filter = BILINEAR; + filter_gf_svc = BILINEAR; + } + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) + thresh_svc_skip_golden = 0; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; + + if (!cpi->use_svc || + (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf; + struct scale_factors *const sf_golden = + &cm->frame_refs[GOLDEN_FRAME - 1].sf; + gf_temporal_ref = 1; + // For temporal long term prediction, check that the golden reference + // is same scale as last reference, otherwise disable. + if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) || + (sf_last->y_scale_fp != sf_golden->y_scale_fp)) { + gf_temporal_ref = 0; + } else { + if (cpi->rc.avg_frame_low_motion > 70) + thresh_svc_skip_golden = 500; + else + thresh_svc_skip_golden = 0; + } + } init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { int i; @@ -1426,23 +1859,32 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; x->skip = 0; + if (cpi->sf.cb_pred_filter_search) { + const int bsl = mi_width_log2_lookup[bsize]; + pred_filter_search = cm->interp_filter == SWITCHABLE + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1 + : 0; + } // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign // filter_ref, we use a less strict condition on assigning filter_ref. // This is to reduce the probabily of entering the flow of not assigning // filter_ref and then skip filter search. - if (xd->above_mi && is_inter_block(xd->above_mi)) - filter_ref = xd->above_mi->interp_filter; - else if (xd->left_mi && is_inter_block(xd->left_mi)) - filter_ref = xd->left_mi->interp_filter; - else - filter_ref = cm->interp_filter; + filter_ref = cm->interp_filter; + if (cpi->sf.default_interp_filter != BILINEAR) { + if (xd->above_mi && is_inter_block(xd->above_mi)) + filter_ref = xd->above_mi->interp_filter; + else if (xd->left_mi && is_inter_block(xd->left_mi)) + filter_ref = xd->left_mi->interp_filter; + } // initialize mode decisions vp9_rd_cost_reset(&best_rdc); vp9_rd_cost_reset(rd_cost); mi->sb_type = bsize; - mi->ref_frame[0] = NONE; - mi->ref_frame[1] = NONE; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; mi->tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -1456,16 +1898,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif // CONFIG_VP9_HIGHBITDEPTH x->source_variance = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 && + x->zero_temp_sad_source && x->source_variance == 0) { + mi->segment_id = 0; + vp9_init_plane_quantizers(cpi, x); + } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && - cpi->denoiser.denoising_level > kDenLowLow) { - vp9_denoiser_reset_frame_stats(ctx); + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi); + if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) + vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) { + if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { usable_ref_frame = GOLDEN_FRAME; @@ -1479,19 +1930,32 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, skip_ref_find_pred[LAST_FRAME] = 1; skip_ref_find_pred[GOLDEN_FRAME] = 1; } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } } // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { - if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { - struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[LAST_FRAME - 1] = 1; + svc->spatial_layer_id > 0 && !gf_temporal_ref) { + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { + svc_force_zero_mode[LAST_FRAME - 1] = 1; + inter_layer_ref = LAST_FRAME; + } } - if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { - struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + inter_layer_ref = GOLDEN_FRAME; + } } } @@ -1507,19 +1971,83 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } - if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + + if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; + if (cpi->oxcf.speed >= 8 && !cpi->use_svc && + ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) + usable_ref_frame = LAST_FRAME; + + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var); + &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb, + bsize, force_skip_low_temp_var, comp_modes > 0); } } - for (idx = 0; idx < RT_INTER_MODES; ++idx) { + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + !gf_temporal_ref) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + // For SVC with quality layers, when QP of lower layer is lower + // than current layer: force check of GF-ZEROMV before early exit + // due to skip flag. + if (svc->spatial_layer_id > 0 && no_scaling && + (cpi->ref_frame_flags & VP9_GOLD_FLAG) && + cm->base_qindex > svc->lower_layer_qindex + 10) + force_test_gf_zeromv = 1; + + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. + large_block = (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; + use_model_yrd_large = + cpi->oxcf.rc_mode == VPX_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->base_qindex; + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; int mode_index; @@ -1527,20 +2055,96 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int64_t this_sse; int is_skippable; int this_early_term = 0; - PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_mv_inter_layer = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NO_REF_FRAME; - ref_frame = ref_mode_set[idx].ref_frame; + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; - if (cpi->use_svc) { - this_mode = ref_mode_set_svc[idx].pred_mode; - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes. + this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; } + if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; - if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + if (svc->previous_frame_is_intra_only) { + if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) continue; + + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). + if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } + + if (comp_pred) { + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; + } + + // For CBR mode: skip the golden reference search if sse of zeromv_last is + // below threshold. + if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) || + (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) + continue; + + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue; + + // For screen content. If zero_temp_sad source is computed: skip + // non-zero motion check for stationary blocks. If the superblock is + // non-stationary then for flat blocks skip the zero last check (keep golden + // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source + // is not computed) skip non-zero motion check for flat blocks. + // TODO(marpan): Compute zero_temp_sad_source per coding block. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + if ((frame_mv[this_mode][ref_frame].as_int != 0 && + x->zero_temp_sad_source) || + (frame_mv[this_mode][ref_frame].as_int == 0 && + x->source_variance == 0 && ref_frame == LAST_FRAME && + !x->zero_temp_sad_source)) + continue; + } else if (frame_mv[this_mode][ref_frame].as_int != 0 && + x->source_variance == 0) { + continue; + } } if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; @@ -1551,76 +2155,92 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, frame_mv[this_mode][ref_frame].as_int != 0)) continue; - if (cpi->rc.alt_ref_gf_group && + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (cpi->rc.alt_ref_gf_group && + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && ref_frame == ALTREF_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } - if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var && - ref_frame == LAST_FRAME && this_mode == NEWMV) { + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && + force_skip_low_temp_var && ref_frame == LAST_FRAME && + this_mode == NEWMV) { continue; } if (cpi->use_svc) { - if (svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & ref_frame_to_flag(i))) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) + for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } mi->ref_frame[0] = ref_frame; - set_ref_ptrs(cm, xd, ref_frame, NONE); + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; - mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 - : rd_threshes[mode_index]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding // speed with little/no subjective quality loss. @@ -1628,76 +2248,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->rc.frames_since_golden > 4) mode_rd_thresh = mode_rd_thresh << 3; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) - continue; + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (this_mode == NEWMV) { - if (ref_frame > LAST_FRAME && !cpi->use_svc && - cpi->oxcf.rc_mode == VPX_CBR) { - int tmp_sad; - uint32_t dis; - int cost_list[5]; - - if (bsize < BLOCK_16X16) continue; - - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); - - if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; - if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) - continue; - - frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; - rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; - frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; - - cpi->find_fractional_mv_step( - x, &frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, - 0); - } else if (svc->use_base_mv && svc->spatial_layer_id) { - if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; - const uint8_t *const pre_buf = - xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); - - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { - // Base layer mv is good. - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 1)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 0)) { + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar, + &best_rdc)) continue; + } + + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; } } + if (skip_this_mv) continue; + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no // need to compute best_pred_sad which is only used to skip golden NEWMV. if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && @@ -1712,13 +2293,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (this_mode != NEARESTMV && + if (this_mode != NEARESTMV && !comp_pred && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) continue; mi->mode = this_mode; mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., @@ -1736,88 +2318,47 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { - int pf_rate[3]; - int64_t pf_dist[3]; - unsigned int pf_var[3]; - unsigned int pf_sse[3]; - TX_SIZE pf_tx_size[3]; - int64_t best_cost = INT64_MAX; - INTERP_FILTER best_filter = SWITCHABLE, filter; - PRED_BUFFER *current_pred = this_mode_pred; - - for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { - int64_t cost; - mi->interp_filter = filter; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], - &pf_var[filter], &pf_sse[filter]); - pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); - cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); - pf_tx_size[filter] = mi->tx_size; - if (cost < best_cost) { - best_filter = filter; - best_cost = cost; - skip_txfm = x->skip_txfm[0]; - - if (reuse_inter_pred) { - if (this_mode_pred != current_pred) { - free_pred_buffer(this_mode_pred); - this_mode_pred = current_pred; - } - current_pred = &tmp[get_pred_buffer(tmp, 3)]; - pd->dst.buf = current_pred->data; - pd->dst.stride = bw; - } - } - } - - if (reuse_inter_pred && this_mode_pred != current_pred) - free_pred_buffer(current_pred); - - mi->interp_filter = best_filter; - mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = pf_rate[best_filter]; - this_rdc.dist = pf_dist[best_filter]; - var_y = pf_var[best_filter]; - sse_y = pf_sse[best_filter]; - x->skip_txfm[0] = skip_txfm; - if (reuse_inter_pred) { - pd->dst.buf = this_mode_pred->data; - pd->dst.stride = this_mode_pred->stride; - } + rd_computed = 1; + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + force_smooth_filter, &this_early_term, + flag_preduv_computed, use_model_yrd_large); } else { -// TODO(jackychen): the low-bitdepth condition causes a segfault in -// high-bitdepth builds. -// https://bugs.chromium.org/p/webm/issues/detail?id=1250 -#if CONFIG_VP9_HIGHBITDEPTH - const int large_block = bsize > BLOCK_32X32; -#else - const int large_block = - x->sb_is_skin ? bsize > BLOCK_32X32 : bsize >= BLOCK_32X32; -#endif mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; + + if (cpi->use_svc && ref_frame == GOLDEN_FRAME && + svc_force_zero_mode[ref_frame - 1]) + mi->interp_filter = filter_gf_svc; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. - if (cpi->oxcf.rc_mode == VPX_CBR && large_block && - !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && - cm->base_qindex) { + if (use_model_yrd_large) { + rd_computed = 1; model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, - &this_early_term); + &this_early_term, flag_preduv_computed); } else { + rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); } + // Save normalized sse (between current and last frame) for (0, 0) motion. + if (ref_frame == LAST_FRAME && + frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_normalized = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16)); + VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0); x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -1837,19 +2378,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_get_switchable_rate(cpi, xd); } } else { - this_rdc.rate += cm->interp_filter == SWITCHABLE - ? vp9_get_switchable_rate(cpi, xd) - : 0; + if (cm->interp_filter == SWITCHABLE) { + if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) + this_rdc.rate += vp9_get_switchable_rate(cpi, xd); + } this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } - if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { RD_COST rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); - if (x->color_sensitivity[0]) + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); - if (x->color_sensitivity[1]) + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); this_rdc.rate += rdc_uv.rate; this_rdc.dist += rdc_uv.dist; @@ -1858,6 +2405,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. this_rdc.rate += ref_frame_cost[ref_frame]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); @@ -1868,15 +2416,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize, frame_mv[this_mode][ref_frame].as_mv.row, frame_mv[this_mode][ref_frame].as_mv.col, - ref_frame == LAST_FRAME); + ref_frame == LAST_FRAME, x->lowvar_highsumdiff, + x->sb_is_skin); } // Skipping checking: test to see if this block can be reconstructed by // prediction only. - if (cpi->allow_encode_breakout) { + if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected && + !svc->high_num_blocks_with_motion) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, - &this_rdc.dist); + &this_rdc.dist, flag_preduv_computed); if (x->skip) { this_rdc.rate += rate_mv; this_rdc.rdcost = @@ -1884,8 +2434,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // On spatially flat blocks for screne content: bias against zero-last + // if the sse_y is non-zero. Only on scene change or high motion frames. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + (scene_change_detected || svc->high_num_blocks_with_motion) && + ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 && + svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) { + this_rdc.rdcost = this_rdc.rdcost << 2; + } + #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx); // Keep track of zero_last cost. @@ -1896,88 +2455,110 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (void)ctx; #endif + mode_checked[this_mode][ref_frame] = 1; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; - best_mode = this_mode; - best_pred_filter = mi->interp_filter; - best_tx_size = mi->tx_size; - best_ref_frame = ref_frame; - best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filter; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { - free_pred_buffer(best_pred); - best_pred = this_mode_pred; + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; } } else { if (reuse_inter_pred) free_pred_buffer(this_mode_pred); } - if (x->skip) break; + if (x->skip && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) + break; // If early termination flag is 1 and at least 2 modes are checked, // the mode search is terminated. - if (best_early_term && idx > 0) { + if (best_early_term && idx > 0 && !scene_change_detected && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) { x->skip = 1; break; } } - mi->mode = best_mode; - mi->interp_filter = best_pred_filter; - mi->tx_size = best_tx_size; - mi->ref_frame[0] = best_ref_frame; - mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + mi->mode = best_pickmode.best_mode; + mi->interp_filter = best_pickmode.best_pred_filter; + mi->tx_size = best_pickmode.best_tx_size; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->mv[0].as_int = + frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; - x->skip_txfm[0] = best_mode_skip_txfm; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. - if (cpi->svc.spatial_layer_id) { + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer. + if (svc->spatial_layer_id && !gf_temporal_ref) { perform_intra_pred = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || - !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && - svc_force_zero_mode[best_ref_frame - 1]); + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) || + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->rc.is_src_frame_alt_ref) + if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) || + svc->previous_frame_is_intra_only) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. - if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred && - (best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize))) { + if (best_rdc.rdcost == INT64_MAX || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) || + (scene_change_detected && perform_intra_pred) || + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && + perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && + !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; + int64_t this_sse = INT64_MAX; int i; - TX_SIZE best_intra_tx_size = TX_SIZES; + PRED_BUFFER *const best_pred = best_pickmode.best_pred; TX_SIZE intra_tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16) - intra_tx_size = TX_16X16; if (reuse_inter_pred && best_pred != NULL) { if (best_pred->data == orig_dst.buf) { this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - this_mode_pred->data, this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH - best_pred = this_mode_pred; + best_pickmode.best_pred = this_mode_pred; } } pd->dst = orig_dst; @@ -1986,18 +2567,35 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, const PREDICTION_MODE this_mode = intra_mode_list[i]; THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; int mode_rd_thresh = rd_threshes[mode_index]; + // For spatially flat blocks, under short_circuit_flat_blocks flag: + // only check DC mode for stationary blocks, otherwise also check + // H and V mode. if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != DC_PRED) { + ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) { continue; } if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize])) continue; - if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) + if (cpi->sf.rt_intra_dc_only_low_content && this_mode != DC_PRED && + x->content_state_sb != kVeryHighSad) continue; + if ((cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index])) || + (!cpi->sf.adaptive_rd_thresh_row_mt && + rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + &rd_thresh_freq_fact[mode_index]))) { + // Avoid this early exit for screen on base layer, for scene + // changes or high motion frames. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN || + svc->spatial_layer_id > 0 || + (!scene_change_detected && !svc->high_num_blocks_with_motion)) + continue; + } + mi->mode = this_mode; mi->ref_frame[0] = INTRA_FRAME; this_rdc.dist = this_rdc.rate = 0; @@ -2005,8 +2603,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, args.skippable = 1; args.rdc = &this_rdc; mi->tx_size = intra_tx_size; - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra, - &args); + + compute_intra_yprediction(this_mode, bsize, x, xd); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 1); + block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize, + VPXMIN(mi->tx_size, TX_16X16), 1, 1); + // Check skip cost here since skippable is not set for for uv, this // mirrors the behavior used by inter if (args.skippable) { @@ -2033,68 +2636,86 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; - best_mode = this_mode; - best_intra_tx_size = mi->tx_size; - best_ref_frame = INTRA_FRAME; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; - best_mode_skip_txfm = x->skip_txfm[0]; + mi->mv[1].as_int = INVALID_MV; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; } } // Reset mb_mode_info to the best inter mode. - if (best_ref_frame != INTRA_FRAME) { - mi->tx_size = best_tx_size; + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; } else { - mi->tx_size = best_intra_tx_size; + mi->tx_size = best_pickmode.best_intra_tx_size; } } pd->dst = orig_dst; - mi->mode = best_mode; - mi->ref_frame[0] = best_ref_frame; - x->skip_txfm[0] = best_mode_skip_txfm; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; if (!is_inter_block(mi)) { mi->interp_filter = SWITCHABLE_FILTERS; } - if (reuse_inter_pred && best_pred != NULL) { + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - vpx_highbd_convolve_copy(best_pred->data, best_pred->stride, - pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, - bw, bh, xd->bd); + vpx_highbd_convolve_copy( + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, + bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 && - cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { + denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && + cpi->denoiser.reset == 0) { VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. + if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, - frame_mv, reuse_inter_pred, best_tx_size, - best_mode, best_ref_frame, best_pred_filter, - best_mode_skip_txfm); - vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision); - recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb, - &best_rdc, bsize, mi_row, mi_col); - best_ref_frame = ctx_den.best_ref_frame; + frame_mv, reuse_inter_pred, &best_pickmode); + vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); + if (denoise_recheck_zeromv) + recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, + yv12_mb, &best_rdc, bsize, mi_row, mi_col); + best_pickmode.best_ref_frame = ctx_den.best_ref_frame; } #endif - if (cpi->sf.adaptive_rd_thresh) { - THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; + if (best_pickmode.best_ref_frame == ALTREF_FRAME || + best_pickmode.best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_pickmode.best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; - if (best_ref_frame == INTRA_FRAME) { + if (cpi->sf.adaptive_rd_thresh) { + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; + + if (best_pickmode.best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); int i; @@ -2102,16 +2723,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // TODO(yunqingwang): Check intra mode mask and only update freq_fact // for those valid modes. for (i = 0; i < intra_modes; i++) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - INTRA_FRAME, best_mode_idx, intra_mode_list[i]); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, INTRA_FRAME, + best_mode_idx, intra_mode_list[i]); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + INTRA_FRAME, best_mode_idx, + intra_mode_list[i]); } } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; - if (best_ref_frame != ref_frame) continue; + if (best_pickmode.best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, - ref_frame, best_mode_idx, this_mode); + if (cpi->sf.adaptive_rd_thresh_row_mt) + update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, + thresh_freq_fact_idx, ref_frame, + best_mode_idx, this_mode); + else + update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize, + ref_frame, best_mode_idx, this_mode); } } } @@ -2129,12 +2761,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; - MV_REFERENCE_FRAME best_ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; int64_t best_rd = INT64_MAX; b_mode_info bsi[MAX_REF_FRAMES][4]; int ref_frame_skip_mask = 0; @@ -2150,11 +2780,13 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int_mv dummy_mv[2]; x->pred_mv_sad[ref_frame] = INT_MAX; - if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; - const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; - vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, - sf); + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, mbmi_ext->mode_context); @@ -2169,7 +2801,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, mi->tx_size = TX_4X4; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; @@ -2258,12 +2890,12 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } vp9_set_mv_search_range(&x->mv_limits, - &mbmi_ext->ref_mvs[0]->as_mv); + &mbmi_ext->ref_mvs[ref_frame][0].as_mv); - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - x->sadperbit4, cond_cost_list(cpi, cost_list), - &mbmi_ext->ref_mvs[ref_frame][0].as_mv, - &tmp_mv, INT_MAX, 0); + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + x->sadperbit4, cond_cost_list(cpi, cost_list), + &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0); x->mv_limits = tmp_mv_limits; @@ -2283,9 +2915,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dummy_dist, &x->pred_sse[ref_frame], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; } else { @@ -2296,7 +2929,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride, + CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf, 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0, vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3, @@ -2317,7 +2951,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #endif model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); this_rdc.rate += b_rate; this_rdc.rdcost = diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h index 9aa00c4fab..15207e6cf4 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_PICKMODE_H_ -#define VP9_ENCODER_VP9_PICKMODE_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_ +#define VPX_VP9_ENCODER_VP9_PICKMODE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -32,4 +32,4 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKMODE_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c index de96c6e068..a1e0b4439e 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c @@ -8,12 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/bitops.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -21,77 +24,64 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i; int eob = -1; - // TODO(jingning) Decide the need of these arguments after the - // quantization process is completed. - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; - const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -100,108 +90,77 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, // TODO(jingning) Refactor this file and combine functions with similar // operations. void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp = 0; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (tmp) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - uint32_t abs_qcoeff = 0; - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + for (i = 0; i < n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - const int64_t tmp = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (abs_qcoeff) eob = i; + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } #endif -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan) { - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *p = &x->plane[plane]; - struct macroblockd_plane *pd = &xd->plane[plane]; - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, - &p->eobs[block], scan, iscan); - return; - } -#endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block], - scan, iscan); -} - static void invert_quant(int16_t *quant, int16_t *shift, int d) { - unsigned t; + unsigned int t; int l, m; - t = d; - for (l = 0; t > 1; l++) t >>= 1; + t = (unsigned int)d; + l = get_msb(t); m = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(m - (1 << 16)); *shift = 1 << (16 - l); @@ -213,10 +172,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; @@ -230,13 +188,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); - const int qrounding_factor = q == 0 ? 64 : 48; + int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + int qrounding_factor = q == 0 ? 64 : 48; + const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7; + + if (cpi->oxcf.sharpness > 0 && q > 0) { + qzbin_factor = 64 + sharpness_adjustment; + qrounding_factor = 64 - sharpness_adjustment; + } for (i = 0; i < 2; ++i) { int qrounding_factor_fp = i == 0 ? 48 : 42; if (q == 0) qrounding_factor_fp = 64; - + if (cpi->oxcf.sharpness > 0) + qrounding_factor_fp = 64 - sharpness_adjustment; // y quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) : vp9_ac_quant(q, 0, cm->bit_depth); @@ -296,7 +261,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; xd->plane[0].dequant = cpi->y_dequant[qindex]; - x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0]; x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1]; @@ -309,7 +273,6 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; xd->plane[i].dequant = cpi->uv_dequant[qindex]; - x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0]; x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; } @@ -326,13 +289,25 @@ void vp9_frame_init_quantizer(VP9_COMP *cpi) { vp9_init_plane_quantizers(cpi, &cpi->td.mb); } -void vp9_set_quantizer(VP9_COMMON *cm, int q) { +void vp9_set_quantizer(VP9_COMP *cpi, int q, int ext_rc_delta_q_uv) { + VP9_COMMON *cm = &cpi->common; // quantizer has to be reinitialized with vp9_init_quantizer() if any // delta_q changes. cm->base_qindex = q; cm->y_dc_delta_q = 0; cm->uv_dc_delta_q = 0; cm->uv_ac_delta_q = 0; + + if (ext_rc_delta_q_uv != 0) { + cm->uv_dc_delta_q = cm->uv_ac_delta_q = ext_rc_delta_q_uv; + vp9_init_quantizer(cpi); + return; + } + + if (cpi->oxcf.delta_q_uv != 0) { + cm->uv_dc_delta_q = cm->uv_ac_delta_q = cpi->oxcf.delta_q_uv; + vp9_init_quantizer(cpi); + } } // Table that converts 0-63 Q-range values passed in outside to the Qindex diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h index 61320361b6..bca2e055a2 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_QUANTIZE_H_ -#define VP9_ENCODER_VP9_QUANTIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_ +#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -37,9 +37,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); } QUANTS; -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, - const int16_t *scan, const int16_t *iscan); - struct VP9_COMP; struct VP9Common; @@ -49,7 +46,7 @@ void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x); void vp9_init_quantizer(struct VP9_COMP *cpi); -void vp9_set_quantizer(struct VP9Common *cm, int q); +void vp9_set_quantizer(struct VP9_COMP *cpi, int q, int ext_rc_delta_q_uv); int vp9_quantizer_to_qindex(int quantizer); @@ -59,4 +56,4 @@ int vp9_qindex_to_quantizer(int qindex); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_QUANTIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c index 1eb8b50f01..529cbfab0c 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c @@ -11,56 +11,61 @@ #include #include #include +#include #include #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vp9/common/vp9_alloccommon.h" -#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_svc_layercontext.h" -// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/internal/vpx_codec_internal.h" + +// Max rate per frame for 1080P and below encodes if no level requirement given. +// For larger formats limit to MAX_MB_RATE bits per MB +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4 frame group. +// If a lower level requirement is specified then this may over ride this value. #define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 - -#define DEFAULT_KF_BOOST 2000 -#define DEFAULT_GF_BOOST 2000 +#define MAXRATE_1080P 4000000 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - -// Use this macro to turn on/off use of alt-refs in one-pass vbr mode. -#define USE_ALTREF_FOR_ONE_PASS 0 - #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -93,10 +98,17 @@ static int inter_minq_12[QINDEX_RANGE]; static int rtc_minq_12[QINDEX_RANGE]; #endif +#ifdef AGGRESSIVE_VBR +static int gf_high = 2400; +static int gf_low = 400; +static int kf_high = 4000; +static int kf_low = 400; +#else static int gf_high = 2000; static int gf_low = 400; -static int kf_high = 5000; -static int kf_low = 400; +static int kf_high = 4800; +static int kf_low = 300; +#endif // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. @@ -125,10 +137,15 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); +#ifdef AGGRESSIVE_VBR + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); +#else arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); - arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); +#endif + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); } } @@ -156,16 +173,26 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; #endif } +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break; + + if (i == QINDEX_RANGE) i--; + + return i; +} + int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, double correction_factor, vpx_bit_depth_t bit_depth) { const double q = vp9_convert_qindex_to_q(qindex, bit_depth); @@ -185,12 +212,13 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; + const int min_frame_target = VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); if (target < min_frame_target) target = min_frame_target; @@ -201,12 +229,15 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { // number of bits will be spent if needed for constructed ARFs. target = min_frame_target; } + // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; - target = VPXMIN(target, max_rate); + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + // target is of type int and VPXMIN cannot evaluate to larger than target + target = (int)VPXMIN(target, max_rate); } return target; } @@ -215,28 +246,78 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; if (oxcf->rc_max_intra_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; - target = VPXMIN(target, max_rate); + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = (int)VPXMIN(target, max_rate); } if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; return target; } +// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same +// way for CBR mode, for the buffering updates below. Look into removing one +// of these (i.e., bits_off_target). +// Update the buffer level before encoding with the per-frame-bandwidth, +void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + rc->bits_off_target += rc->avg_frame_bandwidth; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; +} + +// Update the buffer level before encoding with the per-frame-bandwidth +// for SVC. The current and all upper temporal layers are updated, needed +// for the layered rate control which involves cumulative buffer levels for +// the temporal layers. Allow for using the timestamp(pts) delta for the +// framerate when the set_ref_frame_config is used. +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + // Set this to 1 to use timestamp delta for "framerate" under + // ref_frame_config usage. + int use_timestamp = 1; + const int64_t ts_delta = + svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id]; + for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + if (use_timestamp && cpi->svc.use_set_ref_frame_config && + svc->number_temporal_layers == 1 && ts_delta > 0 && + svc->current_superframe > 0) { + // TODO(marpan): This may need to be modified for temporal layers. + const double framerate_pts = 10000000.0 / ts_delta; + lrc->bits_off_target += saturate_cast_double_to_int( + round(lc->target_bandwidth / framerate_pts)); + } else { + lrc->bits_off_target += saturate_cast_double_to_int( + round(lc->target_bandwidth / lc->framerate)); + } + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + if (i == svc->temporal_layer_id) { + cpi->rc.bits_off_target = lrc->bits_off_target; + cpi->rc.buffer_level = lrc->buffer_level; + } + } +} + // Update the buffer level for higher temporal layers, given the encoded current // temporal layer. -static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { +static void update_layer_buffer_level_postencode(SVC *svc, + int encoded_frame_size) { int i = 0; - int current_temporal_layer = svc->temporal_layer_id; + const int current_temporal_layer = svc->temporal_layer_id; for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - int bits_off_for_this_layer = - (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size); - lrc->bits_off_target += bits_off_for_this_layer; - + lrc->bits_off_target -= encoded_frame_size; // Clip buffer level to maximum buffer size for the layer. lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); @@ -244,21 +325,13 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { } } -// Update the buffer level: leaky bucket model. -static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { - const VP9_COMMON *const cm = &cpi->common; +// Update the buffer level after encoding with encoded frame size. +static void update_buffer_level_postencode(VP9_COMP *cpi, + int encoded_frame_size) { RATE_CONTROL *const rc = &cpi->rc; - - // Non-viewable frames are a special case and are treated as pure overhead. - if (!cm->show_frame) { - rc->bits_off_target -= encoded_frame_size; - } else { - rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; - } - + rc->bits_off_target -= encoded_frame_size; // Clip the buffer level to the maximum specified buffer size. rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); - // For screen-content mode, and if frame-dropper is off, don't let buffer // level go below threshold, given here as -rc->maximum_ buffer_size. if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && @@ -267,8 +340,8 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { rc->buffer_level = rc->bits_off_target; - if (is_one_pass_cbr_svc(cpi)) { - update_layer_buffer_level(&cpi->svc, encoded_frame_size); + if (is_one_pass_svc(cpi)) { + update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -278,13 +351,13 @@ int vp9_rc_get_default_min_gf_interval(int width, int height, static const double factor_safe = 3840 * 2160 * 20.0; const double factor = width * height * framerate; const int default_interval = - clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + clamp((int)round(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); if (factor <= factor_safe) return default_interval; else return VPXMAX(default_interval, - (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + (int)round(MIN_GF_INTERVAL * factor / factor_safe)); // Note this logic makes: // 4K24: 5 // 4K30: 6 @@ -292,7 +365,7 @@ int vp9_rc_get_default_min_gf_interval(int width, int height, } int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { - int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + int interval = VPXMIN(MAX_GF_INTERVAL, (int)round(framerate * 0.75)); interval += (interval & 0x01); // Round to even value return VPXMAX(interval, min_gf_interval); } @@ -329,12 +402,18 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->af_ratio_onepass_vbr = 10; rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->high_num_blocks_with_motion = 0; + rc->hybrid_intra_scene_change = 0; + rc->re_encode_maxq_scene_change = 0; rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; for (i = 0; i < MAX_LAG_BUFFERS; ++i) rc->avg_source_sad[i] = 0; + rc->frames_to_key = 0; rc->frames_since_key = 8; // Sensible default for first frame. rc->this_key_frame_forced = 0; rc->next_key_frame_forced = 0; @@ -342,6 +421,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->source_alt_ref_active = 0; rc->frames_till_gf_update_due = 0; + rc->constrain_gf_key_freq_onepass_vbr = 1; rc->ni_av_qi = oxcf->worst_allowed_q; rc->ni_tot_qi = 0; rc->ni_frames = 0; @@ -351,6 +431,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { rc->rate_correction_factors[i] = 1.0; + rc->damped_adjustment[i] = 0; } rc->min_gf_interval = oxcf->min_gf_interval; @@ -362,27 +443,121 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( oxcf->init_framerate, rc->min_gf_interval); rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; + if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) { + rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; + } else { + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; + } + + rc->force_max_q = 0; + rc->last_post_encode_dropped_scene_change = 0; + rc->use_post_encode_drop = 0; + rc->ext_use_post_encode_drop = 0; + rc->disable_overshoot_maxq_cbr = 0; + rc->arf_active_best_quality_adjustment_factor = 1.0; + rc->arf_increase_active_best_quality = 0; + rc->preserve_arf_as_gld = 0; + rc->preserve_next_arf_as_gld = 0; + rc->show_arf_as_gld = 0; } -int vp9_rc_drop_frame(VP9_COMP *cpi) { +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP): the condition on + // buffer (if its above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level <= drop_mark); + } else { + int i; + // For SVC in the constrained framedrop mode (svc->framedrop_mode = + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if its below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } + } + } + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; + } +} + +int vp9_test_drop(VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!oxcf->drop_frames_water_mark || - (is_one_pass_cbr_svc(cpi) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) { + SVC *svc = &cpi->svc; + int drop_frames_water_mark = oxcf->drop_frames_water_mark; + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { return 0; } else { - if (rc->buffer_level < 0) { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { // Always drop if buffer is below 0. return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. int drop_mark = - (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); - if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { --rc->decimation_factor; - } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -401,11 +576,135 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { } } +int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) { + size_t frame_size = *size << 3; + int64_t new_buffer_level = + cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; + + // For now we drop if new buffer level (given the encoded frame size) goes + // below 0. + if (new_buffer_level < 0) { + *size = 0; + vp9_rc_postencode_update_drop_frame(cpi); + // Update flag to use for next frame. + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) + cpi->rc.last_post_encode_dropped_scene_change = 1; + // Force max_q on next fame. + cpi->rc.force_max_q = 1; + cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + cpi->last_frame_dropped = 1; + cpi->ext_refresh_frame_flags_pending = 0; + if (cpi->use_svc) { + SVC *svc = &cpi->svc; + int sl = 0; + int tl = 0; + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + // Postencode drop is only checked on base spatial layer, + // for now if max-q is set on base we force it on all layers. + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->force_max_q = 1; + lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + } + } + } + return 1; + } + + cpi->rc.force_max_q = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + return 0; +} + +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != (LAYER_DROP && CONSTRAINED_FROM_ABOVE)), + // if the previous spatial layer was dropped, drop the current spatial layer. + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP && + svc->framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP) || + svc->force_drop_constrained_from_above[svc->spatial_layer_id] || + vp9_test_drop(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP && + svc->force_drop_constrained_from_above[svc->number_spatial_layers - + 1] == 0) || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where full superframe is + // dropped, we don't increment the svc frame counters. + // In particular temporal layer counter (which is incremented in + // vp9_inc_frame_in_layer()) won't be incremented, so on a dropped + // frame we try the same temporal_layer_id on next incoming frame. + // This is to avoid an issue with temporal alignment with full + // superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_limit_q(cpi, &q); + return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + static double get_rate_correction_factor(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; double rcf; - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rcf = rc->rate_correction_factors[KF_STD]; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -425,13 +724,14 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; // Normalize RCF to account for the size-dependent scaling factor. factor /= rcf_mult[cpi->rc.frame_size_selector]; factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->rate_correction_factors[KF_STD] = factor; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -452,6 +752,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int correction_factor = 100; double rate_correction_factor = get_rate_correction_factor(cpi); double adjustment_limit; + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; int projected_size_based_on_q = 0; @@ -468,8 +770,9 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { projected_size_based_on_q = vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; projected_size_based_on_q = - vp9_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs, + vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs, rate_correction_factor, cm->bit_depth); } // Work out a size correction factor. @@ -477,10 +780,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / projected_size_based_on_q); - // More heavily damped adjustment used if we have been oscillating either side - // of target. - adjustment_limit = - 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + // Do not use damped adjustment for the first frame of each frame type + if (!cpi->rc.damped_adjustment[rf_lvl]) { + adjustment_limit = 1.0; + cpi->rc.damped_adjustment[rf_lvl] = 1; + } else { + // More heavily damped adjustment used if we have been oscillating either + // side of target. + adjustment_limit = + 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + } cpi->rc.q_2_frame = cpi->rc.q_1_frame; cpi->rc.q_1_frame = cm->base_qindex; @@ -523,6 +832,7 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality) { const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int q = active_worst_quality; int last_error = INT_MAX; int i, target_bits_per_mb, bits_per_mb_at_this_q; @@ -536,42 +846,35 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, i = active_best_quality; do { - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cpi->svc.temporal_layer_id == 0) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cr->apply_cyclic_refresh && + (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) { bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb( - cm->frame_type, i, correction_factor, cm->bit_depth); + frame_type, i, correction_factor, cm->bit_depth); } + int diff_bits = (int)VPXMIN( + VPXMAX(((int64_t)target_bits_per_mb - (int64_t)bits_per_mb_at_this_q), + -INT_MAX), + INT_MAX); if (bits_per_mb_at_this_q <= target_bits_per_mb) { - if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) + if (diff_bits <= last_error) q = i; else q = i - 1; break; } else { - last_error = bits_per_mb_at_this_q - target_bits_per_mb; + last_error = -diff_bits; } } while (++i <= active_worst_quality); - // In CBR mode, this makes sure q is between oscillating Qs to prevent - // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && - (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && - cpi->rc.q_1_frame != cpi->rc.q_2_frame) { - q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), - VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); - } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 && - cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && - cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) { - q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); - } -#endif + // Adjustment to q for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + return q; } @@ -600,13 +903,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_gf_active_quality(const VP9_COMP *const cpi, int q, vpx_bit_depth_t bit_depth) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_CONTROL *const rc = &cpi->rc; + int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : rc->gfu_boost; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } @@ -619,7 +928,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { active_worst_quality = curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1; } else { - if (!rc->is_src_frame_alt_ref && + if (!rc->is_src_frame_alt_ref && !cpi->use_svc && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { active_worst_quality = curr_frame == 1 @@ -651,7 +960,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q) + return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -661,11 +971,25 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; - active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2); + // For SVC if the current base spatial layer was key frame, use the QP from + // that base layer for ambient_qp. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->is_key_frame) { + const RATE_CONTROL *lrc = &lc->rc; + ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3); + } + } if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. - // Maximum limit for down adjustment, ~30%. + // Maximum limit for down adjustment ~30%; make it lower for screen content. int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; if (max_adjustment_down) { buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / max_adjustment_down); @@ -734,6 +1058,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); } } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + cpi->oxcf.gf_cbr_boost_pct && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was @@ -744,7 +1069,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) { @@ -769,21 +1094,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, *top_index = active_worst_quality; *bottom_index = active_best_quality; -#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY - // Limit Q range for the adaptive loop. - if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && - !(cm->current_video_frame == 0)) { - int qdelta = 0; - vpx_clear_system_state(); - qdelta = vp9_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); - *top_index = active_worst_quality + qdelta; - *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; - } -#endif - // Special case code to try and match quality with forced key frames - if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -796,6 +1108,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, q = *top_index; } } + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); assert(*bottom_index <= rc->worst_quality && *bottom_index >= rc->best_quality); @@ -855,8 +1168,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached @@ -900,35 +1214,38 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = rc->avg_frame_qindex[KEY_FRAME]; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; } else if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex; if (cpi->refresh_alt_ref_frame) - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); else - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; int delta_qindex = vp9_compute_qdelta( - rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { @@ -972,6 +1289,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, qdelta = vp9_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } @@ -985,6 +1303,30 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality); + + // For no lookahead: if buffer_level indicates overshoot, then avoid going + // to very low QP. This reduces overshoot observed in Issue: 376707227. + // Note the buffer_level is updated for every encoded frame as: + // buffer_level - starting_buffer_level += (avg_frame_bandwidth - + // encoded_frame_size). So normalizing this with framerate and #encoded + // frames (current_video_frame) gives the difference/error between target + // and encoding bitrate. The additional avg_frame_bandwidth term is to + // compensate for the pre-encoded buffer update (in + // vp9_rc_get_one_pass_vbr_params). + const int qp_thresh = 32; + const int64_t bitrate_err = + (int64_t)(cpi->framerate * + (rc->buffer_level - rc->starting_buffer_level - + rc->avg_frame_bandwidth) / + (cm->current_video_frame + 1)); + // Threshold may be tuned, but for now condition this on low QP. + if (cpi->oxcf.lag_in_frames == 0 && bitrate_err / 1000 < -10 && + qp_thresh < rc->worst_quality && + (q < qp_thresh || *top_index < qp_thresh)) { + q = qp_thresh; + *top_index = VPXMAX(*top_index, q); + } + if (q > *top_index) { // Special case when we are targeting the max allowed rate if (rc->this_frame_target >= rc->max_frame_bandwidth) @@ -1009,19 +1351,122 @@ int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) { 1.75, // GF_ARF_STD 2.00, // KF_STD }; - static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = { - INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME - }; const VP9_COMMON *const cm = &cpi->common; - int qdelta = - vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, - rate_factor_deltas[rf_level], cm->bit_depth); + + int qdelta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth); return qdelta; } #define STATIC_MOTION_THRESH 95 -static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, - int *top_index) { + +static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + + if (rc->this_key_frame_forced) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + double last_boosted_q; + int delta_qindex; + int qindex; + + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, cm->bit_depth); + active_worst_quality = + VPXMIN(qindex + delta_qindex, active_worst_quality); + } else { + qindex = rc->last_boosted_qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = + get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 4; + } + + // Don't allow the active min to be lossless (q0) unlesss the max q + // already indicates lossless. + active_best_quality = + VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + *top_index = active_worst_quality; + *bottom_index = active_best_quality; +} + +static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, + int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int is_intra_frame = frame_is_intra_only(cm); + + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + + int q = cq_level; + int active_best_quality = cq_level; + int active_worst_quality = cq_level; + + // Key frame qp decision + if (is_intra_frame && rc->frames_to_key > 1) + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + + // ARF / GF qp decision + if (!is_intra_frame && !rc->is_src_frame_alt_ref && + cpi->refresh_alt_ref_frame) { + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = ((layer_depth - 1) * cq_level + + active_best_quality + layer_depth / 2) / + layer_depth; + } + } + + q = active_best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + return q; +} + +int vp9_rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index, int gf_group_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -1031,56 +1476,20 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, int active_worst_quality = cpi->twopass.active_worst_quality; int q; int *inter_minq; + int arf_active_best_quality_hl; + int *arfgf_high_motion_minq, *arfgf_low_motion_minq; + const int boost_frame = + !rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); - if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { - // Handle the special case for key frames forced when we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping. - if (rc->this_key_frame_forced) { - double last_boosted_q; - int delta_qindex; - int qindex; + if (oxcf->rc_mode == VPX_Q) + return rc_constant_q(cpi, bottom_index, top_index, gf_group_index); - if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); - active_best_quality = qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 1.25, cm->bit_depth); - active_worst_quality = - VPXMIN(qindex + delta_qindex, active_worst_quality); - } else { - qindex = rc->last_boosted_qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, cm->bit_depth); - active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); - } - } else { - // Not forced keyframe. - double q_adj_factor = 1.0; - double q_val; - // Baseline value derived from cpi->active_worst_quality and kf boost. - active_best_quality = - get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); - - // Allow somewhat lower kf minq with small image formats. - if ((cm->width * cm->height) <= (352 * 288)) { - q_adj_factor -= 0.25; - } - - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); - - // Convert the adjustment factor to a qindex delta - // on active_best_quality. - q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); - active_best_quality += - vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); - } - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + if (frame_is_intra_only(cm)) { + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + } else if (boost_frame) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -1090,66 +1499,81 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, } else { q = active_worst_quality; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; + } + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + arf_active_best_quality_hl = active_best_quality; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + if (rc->arf_increase_active_best_quality == 1) { + ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq); + arf_active_best_quality_hl = arfgf_high_motion_minq[q]; + } else if (rc->arf_increase_active_best_quality == -1) { + ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_low_motion_minq); + arf_active_best_quality_hl = arfgf_low_motion_minq[q]; + } + active_best_quality = + (int)((double)active_best_quality * + rc->arf_active_best_quality_adjustment_factor + + (double)arf_active_best_quality_hl * + (1.0 - rc->arf_active_best_quality_adjustment_factor)); - // Constrained quality use slightly lower active best. - active_best_quality = active_best_quality * 15 / 16; - - } else if (oxcf->rc_mode == VPX_Q) { - if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cq_level; - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); - - // Modify best quality for second level arfs. For mode VPX_Q this - // becomes the baseline frame q. - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) - active_best_quality = (active_best_quality + cq_level + 1) / 2; - } - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = + ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) / + layer_depth; } } else { - if (oxcf->rc_mode == VPX_Q) { - active_best_quality = cq_level; - } else { - active_best_quality = inter_minq[active_worst_quality]; + active_best_quality = inter_minq[active_worst_quality]; - // For the constrained quality mode we don't want - // q to fall below the cq level. - if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { - active_best_quality = cq_level; - } + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; } } // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. - if (cpi->oxcf.rc_mode != VPX_Q) { - if (frame_is_intra_only(cm) || - (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); - active_worst_quality += (cpi->twopass.extend_maxq / 2); - } else { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; - active_worst_quality += cpi->twopass.extend_maxq; + if (frame_is_intra_only(cm) || boost_frame) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); + active_worst_quality += (cpi->twopass.extend_maxq / 2); + + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + assert(layer_depth > 1); + active_best_quality = + VPXMAX(active_best_quality, + cpi->twopass.last_qindex_of_arf_layer[layer_depth - 1]); } + } else { + const int max_layer_depth = gf_group->max_layer_depth; + assert(max_layer_depth > 0); + + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; + active_worst_quality += cpi->twopass.extend_maxq; + + // For normal frames do not allow an active minq lower than the q used for + // the last boosted frame. + active_best_quality = + VPXMAX(active_best_quality, + cpi->twopass.last_qindex_of_arf_layer[max_layer_depth - 1]); } #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY vpx_clear_system_state(); // Static forced key frames Q restrictions dealt with elsewhere. - if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) || - !rc->this_key_frame_forced || - (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { - int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index], + if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced || + cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) { + int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index], active_worst_quality); active_worst_quality = VPXMAX(active_worst_quality + qdelta, active_best_quality); @@ -1169,17 +1593,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); - if (oxcf->rc_mode == VPX_Q) { - q = active_best_quality; - // Special case code to try and match quality with forced key frames. - } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) && - rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); } else { q = rc->last_boosted_qindex; } + } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) { + q = active_best_quality; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, active_worst_quality); @@ -1191,7 +1613,6 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, q = active_worst_quality; } } - clamp(q, active_best_quality, active_worst_quality); *top_index = active_worst_quality; *bottom_index = active_best_quality; @@ -1206,13 +1627,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; + const int gf_group_index = cpi->twopass.gf_group.index; if (cpi->oxcf.pass == 0) { if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); } else { - q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + q = vp9_rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); } if (cpi->sf.use_nonrd_pick_mode) { if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -1225,6 +1648,64 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, return q; } +void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { + VP9_COMMON *cm = &cpi->common; + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + cm->show_existing_frame = 0; + cpi->rc.show_arf_as_gld = 0; + switch (twopass->gf_group.update_type[gf_group_index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + if (cpi->rc.preserve_arf_as_gld) { + cpi->rc.show_arf_as_gld = 1; + cpi->refresh_golden_frame = 0; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + } + break; + case MID_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case USE_BUF_FRAME: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + break; + default: + assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE); + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + } +} + void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { @@ -1234,8 +1715,10 @@ void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, } else { // For very small rate targets where the fractional adjustment // may be tiny make sure there is at least a minimum range. - const int tol_low = (cpi->sf.recode_tolerance_low * frame_target) / 100; - const int tol_high = (cpi->sf.recode_tolerance_high * frame_target) / 100; + const int tol_low = + (int)(((int64_t)cpi->sf.recode_tolerance_low * frame_target) / 100); + const int tol_high = + (int)(((int64_t)cpi->sf.recode_tolerance_high * frame_target) / 100); *frame_under_shoot_limit = VPXMAX(frame_target - tol_low - 100, 0); *frame_over_shoot_limit = VPXMIN(frame_target + tol_high + 100, cpi->rc.max_frame_bandwidth); @@ -1250,13 +1733,15 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { // Modify frame size target when down-scaling. if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && - rc->frame_size_selector != UNSCALED) + rc->frame_size_selector != UNSCALED) { rc->this_frame_target = (int)(rc->this_frame_target * rate_thresh_mult[rc->frame_size_selector]); + } // Target rate per SB64 (including partial SB64s. - rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) / - (cm->width * cm->height)); + const int64_t sb64_target_rate = + ((int64_t)rc->this_frame_target * 64 * 64) / (cm->width * cm->height); + rc->sb64_target_rate = (int)VPXMIN(sb64_target_rate, INT_MAX); } static void update_alt_ref_frame_stats(VP9_COMP *cpi) { @@ -1297,11 +1782,43 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; rc->frames_since_golden++; + + if (rc->show_arf_as_gld) { + rc->frames_since_golden = 0; + // If we are not using alt ref in the up and coming group clear the arf + // active flag. In multi arf group case, if the index is not 0 then + // we are overlaying a mid group arf so should not reset the flag. + if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0)) + rc->source_alt_ref_active = 0; + } } } -static void compute_frame_low_motion(VP9_COMP *const cpi) { +static void update_altref_usage(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + +void vp9_compute_frame_low_motion(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; int mi_row, mi_col; MODE_INFO **mi = cm->mi_grid_visible; RATE_CONTROL *const rc = &cpi->rc; @@ -1309,7 +1826,8 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) { int cnt_zeromv = 0; for (mi_row = 0; mi_row < rows; mi_row++) { for (mi_col = 0; mi_col < cols; mi_col++) { - if (abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) + if (mi[0]->ref_frame[0] == LAST_FRAME && + abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) cnt_zeromv++; mi++; } @@ -1317,17 +1835,30 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) { } cnt_zeromv = 100 * cnt_zeromv / (rows * cols); rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2; + + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. + if (cpi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + for (i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } } void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; const int qindex = cm->base_qindex; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - vp9_cyclic_refresh_postencode(cpi); - } + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int gf_group_index = cpi->twopass.gf_group.index; + const int layer_depth = gf_group->layer_depth[gf_group_index]; // Update rate control heuristics rc->projected_frame_size = (int)(bytes_used << 3); @@ -1336,13 +1867,12 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { vp9_rc_update_rate_correction_factors(cpi); // Keep a record of last Q and ambient average Q. - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->last_q[KEY_FRAME] = qindex; rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->use_svc) { - int i = 0; - SVC *svc = &cpi->svc; + int i; for (i = 0; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); @@ -1353,7 +1883,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } } else { - if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) || + if ((cpi->use_svc) || (!rc->is_src_frame_alt_ref && !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { rc->last_q[INTER_FRAME] = qindex; @@ -1369,6 +1899,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } } + if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi); + // Keep record of last boosted (KF/KF/ARF) Q value. // If the current frame is coded at a lower Q then we also update it. // If all mbs in this group are skipped only update if the Q value is @@ -1380,21 +1912,31 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } - if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; - update_buffer_level(cpi, rc->projected_frame_size); + if ((qindex < cpi->twopass.last_qindex_of_arf_layer[layer_depth]) || + (cm->frame_type == KEY_FRAME) || + (!rc->constrained_gf_group && + (cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { + cpi->twopass.last_qindex_of_arf_layer[layer_depth] = qindex; + } + + if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex; + + update_buffer_level_postencode(cpi, rc->projected_frame_size); // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. - if (cm->frame_type != KEY_FRAME) { - rc->rolling_target_bits = ROUND_POWER_OF_TWO( - rc->rolling_target_bits * 3 + rc->this_frame_target, 2); - rc->rolling_actual_bits = ROUND_POWER_OF_TWO( - rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); - rc->long_rolling_target_bits = ROUND_POWER_OF_TWO( - rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5); - rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO( - rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5); + if (!frame_is_intra_only(cm)) { + rc->rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + rc->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + rc->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5); + rc->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO( + (int64_t)rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, + 5); } // Actual bits spent @@ -1403,9 +1945,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; - if (!cpi->use_svc || is_two_pass_svc(cpi)) { + if (!cpi->use_svc) { if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && - (cm->frame_type != KEY_FRAME)) + (!frame_is_intra_only(cm))) // Update the alternate reference frame stats as appropriate. update_alt_ref_frame_stats(cpi); else @@ -1413,7 +1955,28 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { update_golden_frame_stats(cpi); } - if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; + // If second (long term) temporal reference is used for SVC, + // update the golden frame counter, only for base temporal layer. + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id == 0) { + int i = 0; + if (cpi->refresh_golden_frame) + rc->frames_since_golden = 0; + else + rc->frames_since_golden++; + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + // Update the frames_since_golden for all upper temporal layers. + for (i = 1; i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->frames_since_golden = rc->frames_since_golden; + } + } + + if (frame_is_intra_only(cm)) rc->frames_since_key = 0; if (cm->show_frame) { rc->frames_since_key++; rc->frames_to_key--; @@ -1427,36 +1990,68 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi); + if (!frame_is_intra_only(cm)) + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } + + if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) + svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { - // Update buffer level with zero size, update frame counters, and return. - update_buffer_level(cpi, 0); + cpi->common.current_video_frame++; cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; + cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex; + // For SVC on dropped frame when framedrop_mode != LAYER_DROP: + // in this mode the whole superframe may be dropped if only a single layer + // has buffer underflow (below threshold). Since this can then lead to + // increasing buffer levels/overflow for certain layers even though whole + // superframe is dropped, we cap buffer level if its already stable. + if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP && + cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) { + cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; + cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; + } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } -static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; const int af_ratio = rc->af_ratio_onepass_vbr; - int target = + int64_t target = (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) - ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / + ? ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval * + af_ratio) / (rc->baseline_gf_interval + af_ratio - 1) - : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / + : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); - return vp9_rc_clamp_pframe_target_size(cpi, target); + // For SVC: refresh flags are used to define the pattern, so we can't + // use that for boosting the target size here. + // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC. + // For now just use the CBR logic for setting target size. + if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); + if (target > INT_MAX) target = INT_MAX; + return vp9_rc_clamp_pframe_target_size(cpi, (int)target); } -static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { +int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - const int target = rc->avg_frame_bandwidth * kf_ratio; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } @@ -1479,23 +2074,9 @@ static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) { } } -void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) { RATE_CONTROL *const rc = &cpi->rc; - int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. - if (!cpi->refresh_alt_ref_frame && - (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { - cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; - rc->frames_to_key = cpi->oxcf.key_freq; - rc->kf_boost = DEFAULT_KF_BOOST; - rc->source_alt_ref_active = 0; - } else { - cm->frame_type = INTER_FRAME; - } + VP9_COMMON *const cm = &cpi->common; if (rc->frames_till_gf_update_due == 0) { double rate_err = 1.0; rc->gfu_boost = DEFAULT_GF_BOOST; @@ -1514,39 +2095,63 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rate_err > 3.5) { rc->baseline_gf_interval = VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); - } else if (rc->avg_frame_low_motion < 20) { + } else if (rc->avg_frame_low_motion > 0 && + rc->avg_frame_low_motion < 20) { // Decrease gf interval for high motion case. rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } - // Adjust boost and af_ratio based on avg_frame_low_motion, which varies - // between 0 and 100 (stationary, 100% zero/small motion). - rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + // Adjust boost and af_ratio based on avg_frame_low_motion, which + // varies between 0 and 100 (stationary, 100% zero/small motion). + if (rc->avg_frame_low_motion > 0) + rc->gfu_boost = + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); + else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0) + rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } - adjust_gfint_frame_constraint(cpi, rc->frames_to_key); + if (rc->constrain_gf_key_freq_onepass_vbr) + adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = 0; rc->alt_ref_gf_group = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; } -#endif } +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = + cm->current_video_frame != 0 && rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + vp9_set_gf_update_one_pass_vbr(cpi); if (cm->frame_type == KEY_FRAME) - target = calc_iframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); else - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); + if (cm->show_frame) vp9_update_buffer_level_preencode(cpi); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) vp9_cyclic_refresh_update_parameters(cpi); } -static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { +int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; const SVC *const svc = &cpi->svc; @@ -1554,20 +2159,21 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; int min_frame_target = VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); - int target; + int64_t target; if (oxcf->gf_cbr_boost_pct) { const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100; target = cpi->refresh_golden_frame - ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * - af_ratio_pct) / + ? ((int64_t)rc->avg_frame_bandwidth * + rc->baseline_gf_interval * af_ratio_pct) / (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) - : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / + : ((int64_t)rc->avg_frame_bandwidth * + rc->baseline_gf_interval * 100) / (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); } else { target = rc->avg_frame_bandwidth; } - if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_svc(cpi)) { // Note that for layers, avg_frame_bandwidth is the cumulative // per-frame-bandwidth. For the target size of this frame, use the // layer average frame size (i.e., non-cumulative per-frame-bw). @@ -1588,22 +2194,21 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { target += (target * pct_high) / 200; } if (oxcf->rc_max_inter_bitrate_pct) { - const int max_rate = - rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + const int64_t max_rate = + (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; target = VPXMIN(target, max_rate); } - return VPXMAX(min_frame_target, target); + if (target > INT_MAX) target = INT_MAX; + return VPXMAX(min_frame_target, (int)target); } -static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { +int vp9_calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; const SVC *const svc = &cpi->svc; - int target; + int64_t target; if (cpi->common.current_video_frame == 0) { - target = ((rc->starting_buffer_level / 2) > INT_MAX) - ? INT_MAX - : (int)(rc->starting_buffer_level / 2); + target = rc->starting_buffer_level / 2; } else { int kf_boost = 32; double framerate = cpi->framerate; @@ -1615,88 +2220,295 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { const LAYER_CONTEXT *lc = &svc->layer_context[layer]; framerate = lc->framerate; } - kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16)); + kf_boost = VPXMAX(kf_boost, (int)round(2 * framerate - 16)); if (rc->frames_since_key < framerate / 2) { - kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2)); + kf_boost = (int)round(kf_boost * rc->frames_since_key / (framerate / 2)); + } + + target = ((int64_t)(16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + target = VPXMIN(INT_MAX, target); + return vp9_rc_clamp_iframe_target_size(cpi, (int)target); +} + +static void set_intra_only_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + // Don't allow intra_only frame for bypass/flexible SVC mode, or if number + // of spatial layers is 1 or if number of spatial or temporal layers > 3. + // Also if intra-only is inserted on very first frame, don't allow if + // if number of temporal layers > 1. This is because on intra-only frame + // only 3 reference buffers can be updated, but for temporal layers > 1 + // we generally need to use buffer slots 4 and 5. + if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || + svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || + svc->number_spatial_layers == 1) + return; + cm->show_frame = 0; + cm->intra_only = 1; + cm->frame_type = INTER_FRAME; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (cm->current_video_frame == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + } else { + int i; + int count = 0; + cpi->lst_fb_idx = -1; + cpi->gld_fb_idx = -1; + cpi->alt_fb_idx = -1; + svc->update_buffer_slot[0] = 0; + // For intra-only frame we need to refresh all slots that were + // being used for the base layer (fb_idx_base[i] == 1). + // Start with assigning last first, then golden and then alt. + for (i = 0; i < REF_FRAMES; ++i) { + if (svc->fb_idx_base[i] == 1) { + svc->update_buffer_slot[0] |= 1 << i; + count++; + } + if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; + if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; + if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; + } + // If golden or alt is not being used for base layer, then set them + // to the lst_fb_idx. + if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; + if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; } - target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; } - return vp9_rc_clamp_iframe_target_size(cpi, target); } void vp9_rc_get_svc_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; int target = rc->avg_frame_bandwidth; - int layer = - LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); + int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (svc->first_spatial_layer_to_encode) + svc->layer_context[svc->temporal_layer_id].is_key_frame = 0; // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. - if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + // Key frame is set for any of the following: very first frame, frame flags + // indicates key, superframe counter hits key frequency,(non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. + if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && - (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) && - cpi->svc.spatial_layer_id == 0)) { + (svc->current_superframe % cpi->oxcf.key_freq == 0) && + !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[layer].is_key_frame = 1; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi); - layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - cpi->svc.layer_context[layer].is_key_frame = 1; + if (is_one_pass_svc(cpi)) { + if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); + layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + svc->layer_context[layer].is_key_frame = 1; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags. - target = calc_iframe_target_size_one_pass_cbr(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (is_two_pass_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; + if (is_one_pass_svc(cpi)) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + // Add condition current_video_frame > 0 for the case where first frame + // is intra only followed by overlay/copy frame. In this case we don't + // want to reset is_key_frame to 0 on overlay/copy frame. + lc->is_key_frame = + (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; + if (cpi->oxcf.rc_mode == VPX_CBR) { + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + double rate_err = 0.0; + rc->fac_active_worst_inter = 140; + rc->fac_active_worst_gf = 100; + if (rc->rolling_target_bits > 0) { + rate_err = + (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits; + if (rate_err < 1.0) + rc->fac_active_worst_inter = 120; + else if (rate_err > 2.0) + // Increase active_worst faster if rate fluctuation is high. + rc->fac_active_worst_inter = 160; + } + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); } - cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - } - target = calc_pframe_target_size_one_pass_cbr(cpi); } } + if (svc->simulcast_mode) { + if (svc->spatial_layer_id > 0 && + svc->layer_context[layer].is_key_frame == 1) { + cm->frame_type = KEY_FRAME; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + } + // Set the buffer idx and refresh flags for key frames in simulcast mode. + // Note the buffer slot for long-term reference is set below (line 2255), + // and alt_ref is used for that on key frame. So use last and golden for + // the other two normal slots. + if (cm->frame_type == KEY_FRAME) { + if (svc->number_spatial_layers == 2) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 2; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } + } else if (svc->number_spatial_layers == 3) { + if (svc->spatial_layer_id == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 3; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 1) { + cpi->lst_fb_idx = 1; + cpi->gld_fb_idx = 4; + cpi->alt_fb_idx = 6; + } else if (svc->spatial_layer_id == 2) { + cpi->lst_fb_idx = 2; + cpi->gld_fb_idx = 5; + cpi->alt_fb_idx = 7; + } + } + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + + // Check if superframe contains a sync layer request. + vp9_svc_check_spatial_layer_sync(cpi); + + // If long term termporal feature is enabled, set the period of the update. + // The update/refresh of this reference frame is always on base temporal + // layer frame. + if (svc->use_gf_temporal_ref_current_layer) { + // Only use gf long-term prediction on non-key superframes. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // Use golden for this reference, which will be used for prediction. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + // Enable prediction off LAST (last reference) and golden (which will + // generally be further behind/long-term reference). + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + // Check for update/refresh of reference: only refresh on base temporal + // layer. + if (svc->temporal_layer_id == 0) { + if (svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On key frame we update the buffer index used for long term reference. + // Use the alt_ref since it is not used or updated on key frames. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } else if (rc->frames_till_gf_update_due == 0) { + // Set perdiod of next update. Make it a multiple of 10, as the cyclic + // refresh is typically ~10%, and we'd like the update to happen after + // a few cylces of the refresh (so it better quality frame). Note the + // cyclic refresh for SVC only operates on base temporal layer frames. + // Choose 20 as perdiod for now (2 cycles). + rc->baseline_gf_interval = 20; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + cpi->ext_refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + } + } else if (!svc->use_gf_temporal_ref) { + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; + } + if (svc->set_intra_only_frame) { + set_intra_only_frame(cpi); + if (cpi->oxcf.rc_mode == VPX_CBR) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); + else + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi); + } + // Overlay frame predicts from LAST (intra-only) + if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG; + // Any update/change of global cyclic refresh parameters (amount/delta-qp) // should be done here, before the frame qp is selected. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - rc->frames_till_gf_update_due = INT_MAX; - rc->baseline_gf_interval = INT_MAX; + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); + + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && + svc->spatial_layer_id == svc->first_spatial_layer_to_encode && + svc->temporal_layer_id == 0) { + LAYER_CONTEXT *lc = NULL; + cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); + if (cpi->resize_pending) { + int tl, width, height; + // Apply the same scale to all temporal layers. + for (tl = 0; tl < svc->number_temporal_layers; tl++) { + lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + tl]; + lc->scaling_factor_num_resize = + cpi->resize_scale_num * lc->scaling_factor_num; + lc->scaling_factor_den_resize = + cpi->resize_scale_den * lc->scaling_factor_den; + // Reset rate control for all temporal layers. + lc->rc.buffer_level = lc->rc.optimal_buffer_level; + lc->rc.bits_off_target = lc->rc.optimal_buffer_level; + lc->rc.rate_correction_factors[INTER_FRAME] = + rc->rate_correction_factors[INTER_FRAME]; + } + // Set the size for this current temporal layer. + lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, + lc->scaling_factor_num_resize, + lc->scaling_factor_den_resize, &width, &height); + vp9_set_size_literal(cpi, width, height); + svc->resize_set = 1; + } + } else { + cpi->resize_pending = 0; + svc->resize_set = 0; + } } void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. - if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; @@ -1722,12 +2534,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); - if (cm->frame_type == KEY_FRAME) - target = calc_iframe_target_size_one_pass_cbr(cpi); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi); else - target = calc_pframe_target_size_one_pass_cbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); vp9_rc_set_frame_target(cpi, target); + + if (cm->show_frame) vp9_update_buffer_level_preencode(cpi); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); else @@ -1789,29 +2604,46 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->min_gf_interval = FIXED_GF_INTERVAL; rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL; } else { + double framerate = cpi->framerate; // Set Maximum gf/arf interval rc->max_gf_interval = oxcf->max_gf_interval; rc->min_gf_interval = oxcf->min_gf_interval; - if (rc->min_gf_interval == 0) + if (rc->min_gf_interval == 0) { rc->min_gf_interval = vp9_rc_get_default_min_gf_interval( - oxcf->width, oxcf->height, cpi->framerate); - if (rc->max_gf_interval == 0) - rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( - cpi->framerate, rc->min_gf_interval); - - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; - - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + oxcf->width, oxcf->height, framerate); } + if (rc->max_gf_interval == 0) { + rc->max_gf_interval = + vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval); + } + + // Extended max interval for genuinely static scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; // Clamp min to max rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } } } @@ -1819,27 +2651,29 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); - rc->min_frame_bandwidth = - (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); + rc->avg_frame_bandwidth = saturate_cast_double_to_int( + round(oxcf->target_bandwidth / cpi->framerate)); - rc->min_frame_bandwidth = - VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + int64_t vbr_min_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100; + vbr_min_bits = VPXMIN(vbr_min_bits, INT_MAX); + + rc->min_frame_bandwidth = VPXMAX((int)vbr_min_bits, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. - vbr_max_bits = - (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / - 100); + // However this limit is extended if a very high rate is given on the command + // line or the rate can not be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value take precedence. + int64_t vbr_max_bits = + (int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section / 100; + vbr_max_bits = VPXMIN(vbr_max_bits, INT_MAX); + rc->max_frame_bandwidth = - VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits); + VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), (int)vbr_max_bits); vp9_rc_set_gf_interval_range(cpi, rc); } @@ -1849,44 +2683,43 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { RATE_CONTROL *const rc = &cpi->rc; int64_t vbr_bits_off_target = rc->vbr_bits_off_target; - int max_delta; - int frame_window = VPXMIN(16, ((int)cpi->twopass.total_stats.count - - cpi->common.current_video_frame)); + int64_t frame_target = *this_frame_target; + int frame_window = (int)VPXMIN( + 16, cpi->twopass.total_stats.count - cpi->common.current_video_frame); // Calcluate the adjustment to rate for this frame. if (frame_window > 0) { - max_delta = (vbr_bits_off_target > 0) - ? (int)(vbr_bits_off_target / frame_window) - : (int)(-vbr_bits_off_target / frame_window); + int64_t max_delta = (vbr_bits_off_target > 0) + ? (vbr_bits_off_target / frame_window) + : (-vbr_bits_off_target / frame_window); - max_delta = VPXMIN(max_delta, - ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); + max_delta = + VPXMIN(max_delta, ((frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100)); // vbr_bits_off_target > 0 means we have extra bits to spend if (vbr_bits_off_target > 0) { - *this_frame_target += (vbr_bits_off_target > max_delta) - ? max_delta - : (int)vbr_bits_off_target; + frame_target += VPXMIN(vbr_bits_off_target, max_delta); } else { - *this_frame_target -= (vbr_bits_off_target < -max_delta) - ? max_delta - : (int)-vbr_bits_off_target; + frame_target -= VPXMIN(-vbr_bits_off_target, max_delta); } } // Fast redistribution of bits arising from massive local undershoot. - // Dont do it for kf,arf,gf or overlay frames. + // Don't do it for kf,arf,gf or overlay frames. if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && rc->vbr_bits_off_target_fast) { - int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); - int fast_extra_bits; - fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits); - fast_extra_bits = (int)VPXMIN( - fast_extra_bits, - VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8)); - *this_frame_target += (int)fast_extra_bits; + int64_t one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, frame_target); + int64_t fast_extra_bits = + VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits); + fast_extra_bits = + VPXMIN(fast_extra_bits, + VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8)); + frame_target += fast_extra_bits; rc->vbr_bits_off_target_fast -= fast_extra_bits; } + + // Clamp the target for the frame to the maximum allowed for one frame. + *this_frame_target = (int)VPXMIN(frame_target, INT_MAX); } void vp9_set_target_rate(VP9_COMP *cpi) { @@ -1898,9 +2731,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) { else target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate); + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } vp9_rc_set_frame_target(cpi, target_rate); } @@ -1912,9 +2747,11 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { RESIZE_ACTION resize_action = NO_RESIZE; int avg_qp_thr1 = 70; int avg_qp_thr2 = 50; - int min_width = 180; - int min_height = 180; + // Don't allow for resized frame to go below 320x180, resize in steps of 3/4. + int min_width = (320 * 4) / 3; + int min_height = (180 * 4) / 3; int down_size_on = 1; + int force_downsize_rate = 0; cpi->resize_scale_num = 1; cpi->resize_scale_den = 1; // Don't resize on key frame; reset the counters on key frame. @@ -1923,20 +2760,9 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { cpi->resize_count = 0; return 0; } - // Check current frame reslution to avoid generating frames smaller than - // the minimum resolution. - if (ONEHALFONLY_RESIZE) { - if ((cm->width >> 1) < min_width || (cm->height >> 1) < min_height) - down_size_on = 0; - } else { - if (cpi->resize_state == ORIG && - (cm->width * 3 / 4 < min_width || cm->height * 3 / 4 < min_height)) - return 0; - else if (cpi->resize_state == THREE_QUARTER && - ((cpi->oxcf.width >> 1) < min_width || - (cpi->oxcf.height >> 1) < min_height)) - down_size_on = 0; - } + + // No resizing down if frame size is below some limit. + if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0; #if CONFIG_VP9_TEMPORAL_DENOISING // If denoiser is on, apply a smaller qp threshold. @@ -1946,11 +2772,32 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { } #endif + // Force downsize based on per-frame-bandwidth, for extreme case, + // for HD input. + if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) { + if (rc->avg_frame_bandwidth < 300000 / 30) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } else if (rc->avg_frame_bandwidth < 400000 / 30) { + resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR; + cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER; + force_downsize_rate = 1; + } + } else if (cpi->resize_state == THREE_QUARTER && + cm->width * cm->height >= 960 * 540) { + if (rc->avg_frame_bandwidth < 300000 / 30) { + resize_action = DOWN_ONEHALF; + cpi->resize_state = ONE_HALF; + force_downsize_rate = 1; + } + } + // Resize based on average buffer underflow and QP over some window. // Ignore samples close to key frame, since QP is usually high after key. - if (cpi->rc.frames_since_key > 2 * cpi->framerate) { - const int window = (int)(4 * cpi->framerate); - cpi->resize_avg_qp += cm->base_qindex; + if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) { + const int window = VPXMIN(30, (int)round(2 * cpi->framerate)); + cpi->resize_avg_qp += rc->last_q[INTER_FRAME]; if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100)) ++cpi->resize_buffer_underflow; ++cpi->resize_count; @@ -1962,8 +2809,9 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { // Resize back up if average QP is low, and we are currently in a resized // down state, i.e. 1/2 or 3/4 of original resolution. // Currently, use a flag to turn 3/4 resizing feature on/off. - if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2)) { - if (cpi->resize_state == THREE_QUARTER && down_size_on) { + if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2) && + down_size_on) { + if (cpi->resize_state == THREE_QUARTER) { resize_action = DOWN_ONEHALF; cpi->resize_state = ONE_HALF; } else if (cpi->resize_state == ORIG) { @@ -2010,7 +2858,7 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { // Reset buffer level to optimal, update target size. rc->buffer_level = rc->optimal_buffer_level; rc->bits_off_target = rc->optimal_buffer_level; - rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi); + rc->this_frame_target = vp9_calc_pframe_target_size_one_pass_cbr(cpi); // Get the projected qindex, based on the scaled target frame size (scaled // so target_bits_per_mb in vp9_rc_regulate_q will be correct target). target_bits_per_frame = (resize_action >= 0) @@ -2035,7 +2883,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; @@ -2046,7 +2895,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { uint64_t avg_source_sad_lag = avg_sad_current; int high_source_sad_lagindex = -1; int steady_sad_lagindex = -1; - uint32_t sad_thresh1 = 60000; + uint32_t sad_thresh1 = 70000; uint32_t sad_thresh2 = 120000; int low_content = 0; int high_content = 0; @@ -2135,9 +2984,14 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // Adjust factors for active_worst setting & af_ratio for next gf interval. rc->fac_active_worst_inter = 150; // corresponds to 3/2 (= 150 /100). rc->fac_active_worst_gf = 100; - if (rate_err < 1.5 && !high_content) { + if (rate_err < 2.0 && !high_content) { rc->fac_active_worst_inter = 120; rc->fac_active_worst_gf = 90; + } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) { + // Increase active_worst faster at low Q if rate fluctuation is high. + rc->fac_active_worst_inter = 200; + if (rc->avg_frame_qindex[INTER_FRAME] < 8) + rc->fac_active_worst_inter = 400; } if (low_content && rc->avg_frame_low_motion > 80) { rc->af_ratio_onepass_vbr = 15; @@ -2145,11 +2999,16 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { - // Don't use alt-ref if there is a scene cut within the group, - // or content is not low. - if ((rc->high_source_sad_lagindex > 0 && + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. + int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. + if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { rc->source_alt_ref_pending = 0; @@ -2158,13 +3017,13 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; // If alt-ref is used for this gf group, limit the interval. - if (rc->baseline_gf_interval > 10 && - rc->baseline_gf_interval < rc->frames_to_key) - rc->baseline_gf_interval = 10; + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } } } -#endif - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } rc->prev_avg_source_sad_lag = avg_source_sad_lag; @@ -2175,27 +3034,59 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // in content and allow rate control to react. // This function also handles special case of lag_in_frames, to measure content // level in #future frames set by the lag_in_frames. -void vp9_avg_source_sad(VP9_COMP *cpi) { +void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source; + YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source; + uint8_t *src_y; + int src_ystride; + int src_width; + int src_height; + uint8_t *last_src_y; + int last_src_ystride; + int last_src_width; + int last_src_height; + if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL || + (cpi->use_svc && cpi->svc.current_superframe == 0)) + return; + src_y = unscaled_src->y_buffer; + src_ystride = unscaled_src->y_stride; + src_width = unscaled_src->y_width; + src_height = unscaled_src->y_height; + last_src_y = unscaled_last_src->y_buffer; + last_src_ystride = unscaled_last_src->y_stride; + last_src_width = unscaled_last_src->y_width; + last_src_height = unscaled_last_src->y_height; +#if CONFIG_VP9_HIGHBITDEPTH + if (cm->use_highbitdepth) return; +#endif rc->high_source_sad = 0; - if (cpi->Last_Source != NULL && - cpi->Last_Source->y_width == cpi->Source->y_width && - cpi->Last_Source->y_height == cpi->Source->y_height) { + rc->high_num_blocks_with_motion = 0; + // For SVC: scene detection is only checked on first spatial layer of + // the superframe using the original/unscaled resolutions. + if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + src_width == last_src_width && src_height == last_src_height) { YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; - uint8_t *src_y = cpi->Source->y_buffer; - int src_ystride = cpi->Source->y_stride; - uint8_t *last_src_y = cpi->Last_Source->y_buffer; - int last_src_ystride = cpi->Last_Source->y_stride; + int num_mi_cols = cm->mi_cols; + int num_mi_rows = cm->mi_rows; int start_frame = 0; int frames_to_buffer = 1; int frame = 0; + int scene_cut_force_key_frame = 0; + int num_zero_temp_sad = 0; uint64_t avg_sad_current = 0; - uint32_t min_thresh = 4000; + uint32_t min_thresh = 20000; // ~5 * 64 * 64 float thresh = 8.0f; - if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 60000; - thresh = 2.1f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000; + if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f; + if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) { + const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2); + num_mi_cols = aligned_width >> MI_SIZE_LOG2; + num_mi_rows = aligned_height >> MI_SIZE_LOG2; } if (cpi->oxcf.lag_in_frames > 0) { frames_to_buffer = (cm->current_video_frame == 1) @@ -2220,6 +3111,8 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; // Update recursive average for current frame. if (avg_sad_current > 0) rc->avg_source_sad[0] = @@ -2239,27 +3132,29 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { const BLOCK_SIZE bsize = BLOCK_64X64; // Loop over sub-sample of frame, compute average sad over 64x64 blocks. uint64_t avg_sad = 0; + uint64_t tmp_sad = 0; int num_samples = 0; - int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; - int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; if (cpi->oxcf.lag_in_frames > 0) { src_y = frames[frame]->y_buffer; src_ystride = frames[frame]->y_stride; last_src_y = frames[frame + 1]->y_buffer; last_src_ystride = frames[frame + 1]->y_stride; } + num_zero_temp_sad = 0; for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. - // If the partition copy is on, compute for every superblock. - if (cpi->sf.copy_partition_flag || - ((sbi_row > 0 && sbi_col > 0) && + if (((sbi_row > 0 && sbi_col > 0) && (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { + tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + avg_sad += tmp_sad; num_samples++; - avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); + if (tmp_sad == 0) num_zero_temp_sad++; } src_y += 64; last_src_y += 64; @@ -2276,17 +3171,52 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { if (avg_sad > VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad[0] * thresh)) && - rc->frames_since_key > 1) + rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && + num_zero_temp_sad < 3 * (num_samples >> 2)) rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; } else { rc->avg_source_sad[lagframe_idx] = avg_sad; } + if (num_zero_temp_sad < (3 * num_samples >> 2)) + rc->high_num_blocks_with_motion = 1; } } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } // For VBR, under scene change/high content change, force golden refresh. if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2294,16 +3224,16 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; rc->source_alt_ref_pending = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1; -#endif + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); rc->count_last_scene_change = 0; } else { @@ -2317,21 +3247,65 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { // Test if encoded frame will significantly overshoot the target bitrate, and // if so, set the QP, reset/adjust some rate control parameters, and return 1. +// frame_size = -1 means frame has not been encoded. int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int thresh_qp = 3 * (rc->worst_quality >> 2); - int thresh_rate = rc->avg_frame_bandwidth * 10; - if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) { + SPEED_FEATURES *const sf = &cpi->sf; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower thresh_qp for video (more overshoot at lower Q) to be + // more conservative for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_qp = 3 * (rc->worst_quality >> 2); + // If this decision is not based on an encoded frame size but just on + // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt == + // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate) + // condition in this case. + // TODO(marpan): Use a better size/rate condition for this case and + // adjust thresholds. + if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ || + frame_size > thresh_rate) && + cm->base_qindex < thresh_qp) { double rate_correction_factor = cpi->rc.rate_correction_factors[INTER_NORMAL]; const int target_size = cpi->rc.avg_frame_bandwidth; + const uint64_t sad_thr = 64 * 64 * 32; + int force_maxqp = 1; double new_correction_factor; int target_bits_per_mb; double q2; int enumerator; - // Force a re-encode, and for now use max-QP. - *q = cpi->rc.worst_quality; + // Set a larger QP. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + (rc->buffer_level > (3 * rc->optimal_buffer_level) >> 2) && + (cpi->rc.avg_source_sad[0] < sad_thr)) { + *q = (*q + cpi->rc.worst_quality) >> 1; + force_maxqp = 0; + } else { + *q = cpi->rc.worst_quality; + } + cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 1; + // If the frame_size is much larger than the threshold (big content change) + // and the encoded frame used alot of Intra modes, then force hybrid_intra + // encoding for the re-encode on this scene change. hybrid_intra will + // use rd-based intra mode selection for small blocks. + if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) { + MODE_INFO **mi = cm->mi_grid_visible; + int sum_intra_usage = 0; + int mi_row, mi_col; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; + mi++; + } + mi += 8; + } + sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols); + if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1; + } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting @@ -2360,20 +3334,32 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { } // For temporal layers, reset the rate control parametes across all // temporal layers. + // If the first_spatial_layer_to_encode > 0, then this superframe has + // skipped lower base layers. So in this case we should also reset and + // force max-q for spatial layers < first_spatial_layer_to_encode. + // For the case of no inter-layer prediction on delta frames: reset and + // force max-q for all spatial layers, to avoid excessive frame drops. if (cpi->use_svc) { - int i = 0; + int tl = 0; + int sl = 0; SVC *svc = &cpi->svc; - for (i = 0; i < svc->number_temporal_layers; ++i) { - const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; - lrc->avg_frame_qindex[INTER_FRAME] = *q; - lrc->buffer_level = rc->optimal_buffer_level; - lrc->bits_off_target = rc->optimal_buffer_level; - lrc->rc_1_frame = 0; - lrc->rc_2_frame = 0; - lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + int num_spatial_layers = VPXMAX(1, svc->first_spatial_layer_to_encode); + if (svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON) + num_spatial_layers = svc->number_spatial_layers; + for (sl = 0; sl < num_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = *q; + lrc->buffer_level = lrc->optimal_buffer_level; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + lrc->force_max_q = force_maxqp; + } } } return 1; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h index 70aef03ffb..0c61ad3461 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RATECTRL_H_ -#define VP9_ENCODER_VP9_RATECTRL_H_ +#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_RATECTRL_H_ #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" @@ -21,14 +21,30 @@ extern "C" { #endif +// Used to control aggressive VBR mode. +// #define AGGRESSIVE_VBR 1 + // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + +// Threshold used to define a KF group as static (e.g. a slide show). +// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -70,7 +86,7 @@ static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 }; typedef struct { - // Rate targetting variables + // Rate targeting variables int base_frame_target; // A baseline frame target before adjustment // for previous under or over shoot. int this_frame_target; // Actual frame target after rc adjustment. @@ -147,6 +163,8 @@ typedef struct { int rc_2_frame; int q_1_frame; int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; // Auto frame-scaling variables. FRAME_SCALE_LEVEL frame_size_selector; @@ -160,12 +178,43 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int high_num_blocks_with_motion; int alt_ref_gf_group; + int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; + int hybrid_intra_scene_change; + int re_encode_maxq_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; + int force_max_q; + // Last frame was dropped post encode on scene change. + int last_post_encode_dropped_scene_change; + // Enable post encode frame dropping for screen content. Only enabled when + // ext_use_post_encode_drop is enabled by user. + int use_post_encode_drop; + // External flag to enable post encode frame dropping, controlled by user. + int ext_use_post_encode_drop; + // Flag to disable CBR feature to increase Q on overshoot detection. + int disable_overshoot_maxq_cbr; + int damped_adjustment[RATE_FACTOR_LEVELS]; + double arf_active_best_quality_adjustment_factor; + int arf_increase_active_best_quality; + + int preserve_arf_as_gld; + int preserve_next_arf_as_gld; + int show_arf_as_gld; + + // Flag to constrain golden frame interval on key frame frequency for 1 pass + // VBR. + int constrain_gf_key_freq_onepass_vbr; + + // The index of the current GOP. Start from zero. + // When a key frame is inserted, it resets to zero. + int gop_global_index; } RATE_CONTROL; struct VP9_COMP; @@ -174,18 +223,20 @@ struct VP9EncoderConfig; void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc); -int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, double correction_factor, vpx_bit_depth_t bit_depth); double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); +int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth); + void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to -// be passed in to ensure that the max_gf_interval returned is at least as bis +// be passed in to ensure that the max_gf_interval returned is at least as big // as that. -int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -213,6 +264,12 @@ int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); // encode_frame_to_data_rate() function. void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi); +int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi); +void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi); void vp9_rc_get_svc_params(struct VP9_COMP *cpi); // Post encode update of the rate control parameters based @@ -225,13 +282,18 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); // Changes only the rate correction factors in the rate control structure. void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi); +// Post encode drop for CBR mode. +int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size); + +int vp9_test_drop(struct VP9_COMP *cpi); + // Decide if we should drop this frame: For 1-pass CBR. // Changes only the decimation count in the rate control structure int vp9_rc_drop_frame(struct VP9_COMP *cpi); // Computes frame size bounds. void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); @@ -278,12 +340,22 @@ void vp9_set_target_rate(struct VP9_COMP *cpi); int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); -void vp9_avg_source_sad(struct VP9_COMP *cpi); +void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); +void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); + +void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); + +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + +int vp9_rc_pick_q_and_bounds_two_pass(const struct VP9_COMP *cpi, + int *bottom_index, int *top_index, + int gf_group_index); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RATECTRL_H_ +#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c index 76c639a647..95c95971c5 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c @@ -57,6 +57,30 @@ void vp9_rd_cost_init(RD_COST *rd_cost) { rd_cost->rdcost = 0; } +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) { + assert(mult >= 0); + assert(div > 0); + if (rate >= 0 && dist >= 0) { + return RDCOST(mult, div, rate, dist); + } + if (rate >= 0 && dist < 0) { + return RDCOST_NEG_D(mult, div, rate, -dist); + } + if (rate < 0 && dist >= 0) { + return RDCOST_NEG_R(mult, div, -rate, dist); + } + return -RDCOST(mult, div, -rate, -dist); +} + +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) { + rd_cost->rdcost = + vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist); + } else { + vp9_rd_cost_reset(rd_cost); + } +} + // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. @@ -69,10 +93,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < INTRA_MODES; ++i) { @@ -82,9 +108,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -143,34 +188,125 @@ void vp9_init_me_luts(void) { static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; -static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, - 128, 144 }; -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - int64_t rdmult = 0; - switch (cpi->common.bit_depth) { - case VPX_BITS_8: rdmult = 88 * q * q / 24; break; - case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; +// Note that the element below for frame type "USE_BUF_FRAME", which indicates +// that the show frame flag is set, should not be used as no real frame +// is encoded so we should not reach here. However, a dummy value +// is inserted here to make sure the data structure has the right number +// of values assigned. +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144 }; + +// Configure Vizier RD parameters. +// Later this function will use passed in command line values. +void vp9_init_rd_parameters(VP9_COMP *cpi) { + RD_CONTROL *const rdc = &cpi->rd_ctrl; + + // When |use_vizier_rc_params| is 1, we expect the rd parameters have been + // initialized by the pass in values. + // Be careful that parameters below are only initialized to 1, if we do not + // pass values to them. It is desired to take care of each parameter when + // using |use_vizier_rc_params|. + if (cpi->twopass.use_vizier_rc_params) return; + + // Make sure this function is floating point safe. + vpx_clear_system_state(); + + rdc->rd_mult_inter_qp_fac = 1.0; + rdc->rd_mult_arf_qp_fac = 1.0; + rdc->rd_mult_key_qp_fac = 1.0; +} + +// Returns the default rd multiplier for inter frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_inter_rd_multiplier(int qindex) { + return 4.15 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for ARF/Golden Frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_arf_rd_multiplier(int qindex) { + return 4.25 + (0.001 * (double)qindex); +} + +// Returns the default rd multiplier for key frames for a given qindex. +// The function here is a first pass estimate based on data from +// a previous Vizer run +static double def_kf_rd_multiplier(int qindex) { + return 4.35 + (0.001 * (double)qindex); +} + +int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { + const RD_CONTROL *rdc = &cpi->rd_ctrl; + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); + // largest dc_quant is 21387, therefore rdmult should fit in int32_t + int rdmult = q * q; + + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + + // Make sure this function is floating point safe. + vpx_clear_system_state(); + + if (cpi->common.frame_type == KEY_FRAME) { + double def_rd_q_mult = def_kf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac); + } else if (!cpi->rc.is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + double def_rd_q_mult = def_arf_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac); + } else { + double def_rd_q_mult = def_inter_rd_multiplier(qindex); + rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac); + } + +#if CONFIG_VP9_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: break; } -#else - int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH + return rdmult > 0 ? rdmult : 1; +} + +static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { + int64_t rdmult_64 = rdmult; if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; - const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100)); + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : cpi->rc.gfu_boost; + const int boost_index = VPXMIN(15, (gfu_boost / 100)); - rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; - rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7; + rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7); } - if (rdmult < 1) rdmult = 1; - return (int)rdmult; + return (int)rdmult_64; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) { + return cpi->ext_ratectrl.ext_rdmult; + } + return modulate_rdmult(cpi, rdmult); +} + +int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) { + int rdmult = + vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex); + rdmult = (int)((double)rdmult / beta); + rdmult = rdmult > 0 ? rdmult : 1; + return modulate_rdmult(cpi, rdmult); } static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { @@ -179,10 +315,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -203,12 +339,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else (void)cpi; @@ -249,6 +384,15 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { } } +void vp9_build_inter_mode_cost(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + int i; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i], + vp9_inter_mode_tree); + } +} + void vp9_initialize_rd_consts(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; @@ -297,72 +441,68 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc, cm->allow_high_precision_mv); - - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - vp9_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); + vp9_build_inter_mode_cost(cpi); } } } } +// NOTE: The tables below must be of the same size. + +// The functions described below are sampled at the four most significant +// bits of x^2 + 8 / 256. + +// Normalized rate: +// This table models the rate for a Laplacian source with given variance +// when quantized with a uniform quantizer with given stepsize. The +// closed form expression is: +// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], +// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), +// and H(x) is the binary entropy function. +static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044, + 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037, + 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179, + 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398, + 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745, + 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213, + 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21, + 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0, +}; + +// Normalized distortion: +// This table models the normalized distortion for a Laplacian source +// with given variance when quantized with a uniform quantizer +// with given stepsize. The closed form expression is: +// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) +// where x = qpstep / sqrt(variance). +// Note the actual distortion is Dn * variance. +static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, + 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21, + 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69, + 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200, + 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458, + 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823, + 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001, + 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, +}; +static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, +}; + static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { - // NOTE: The tables below must be of the same size. - - // The functions described below are sampled at the four most significant - // bits of x^2 + 8 / 256. - - // Normalized rate: - // This table models the rate for a Laplacian source with given variance - // when quantized with a uniform quantizer with given stepsize. The - // closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), - // and H(x) is the binary entropy function. - static const int rate_tab_q10[] = { - 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, - 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, - 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, - 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, - 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, - 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, - 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, - 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, - 5, 3, 2, 1, 1, 1, 0, 0, - }; - - // Normalized distortion: - // This table models the normalized distortion for a Laplacian source - // with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance). - // Note the actual distortion is Dn * variance. - static const int dist_tab_q10[] = { - 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, - 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, - 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, - 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, - 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, - 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, - 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, - 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, - 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, - }; - static const int xsq_iq_q10[] = { - 0, 4, 8, 12, 16, 20, 24, 28, 32, - 40, 48, 56, 64, 72, 80, 88, 96, 112, - 128, 144, 160, 176, 192, 208, 224, 256, 288, - 320, 352, 384, 416, 448, 480, 544, 608, 672, - 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, - 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, - 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, - 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, - 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, - 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, - 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, - 180192, 196576, 212960, 229344, 245728, - }; const int tmp = (xsq_q10 >> 2) + 8; const int k = get_msb(tmp) - 3; const int xq = (k << 3) + ((tmp >> k) & 0x7); @@ -373,6 +513,8 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } +static const uint32_t MAX_XSQ_Q10 = 245727; + void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist) { @@ -387,7 +529,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, *dist = 0; } else { int d_q10, r_q10; - static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); @@ -397,6 +538,12 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } +// Disable gcc 12.2 false positive warning. +// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd, ENTROPY_CONTEXT t_above[16], @@ -425,15 +572,18 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { @@ -447,8 +597,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; const int num_mv_refs = - MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); + MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size); MV pred_mv[3]; pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; @@ -458,11 +607,12 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + // Get the sad for each candidate reference mv. for (i = 0; i < num_mv_refs; ++i) { const MV *this_mv = &pred_mv[i]; int fp_row, fp_col; - + if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue; if (i == 1 && near_same_nearest) continue; fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; @@ -527,6 +677,7 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, const VP9_COMMON *const cm = &cpi->common; const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ? &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL; @@ -624,19 +775,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth) { - const int q = vp9_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - switch (bit_depth) { - case VPX_BITS_8: return 20 * q; - case VPX_BITS_10: return 5 * q; - case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } -#else - return 20 * q; -#endif // CONFIG_VP9_HIGHBITDEPTH +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. + return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h index 05344b6cf0..6c61ae514a 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RD_H_ -#define VP9_ENCODER_VP9_RD_H_ +#ifndef VPX_VP9_ENCODER_VP9_RD_H_ +#define VPX_VP9_ENCODER_VP9_RD_H_ #include @@ -27,20 +27,27 @@ extern "C" { #define RD_EPB_SHIFT 6 #define RDCOST(RM, DM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM)) + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM)) +#define RDCOST_NEG_R(RM, DM, R, D) \ + ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) +#define RDCOST_NEG_D(RM, DM, R, D) \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM)) + #define QIDX_SKIP_THRESH 115 #define MV_COST_WEIGHT 108 #define MV_COST_WEIGHT_SUB 120 -#define INVALID_MV 0x80008000 - #define MAX_MODES 30 #define MAX_REFS 6 +#define RD_THRESH_INIT_FACT 32 #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 +#define VP9_DIST_SCALE_LOG2 4 +#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2) + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -94,11 +101,18 @@ typedef enum { THR_INTRA, } THR_MODES_SUB8X8; +typedef struct { + // RD multiplier control factors added for Vizier project. + double rd_mult_inter_qp_fac; + double rd_mult_arf_qp_fac; + double rd_mult_key_qp_fac; +} RD_CONTROL; + typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number - // is used in combination with the current block size, and thresh_freq_fact - // to pick a threshold. + // is used in combination with the current block size, and thresh_freq_fact to + // pick a threshold. int thresh_mult[MAX_MODES]; int thresh_mult_sub8x8[MAX_REFS]; @@ -107,9 +121,12 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; int RDMULT; int RDDIV; + double r0; } RD_OPT; typedef struct RD_COST { @@ -122,19 +139,29 @@ typedef struct RD_COST { void vp9_rd_cost_reset(RD_COST *rd_cost); // Initialize the rate distortion cost values to zero. void vp9_rd_cost_init(RD_COST *rd_cost); +// It supports negative rate and dist, which is different from RDCOST(). +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist); +// Update the cost value based on its rate and distortion. +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost); struct TileInfo; struct TileDataEnc; struct VP9_COMP; struct macroblock; +void vp9_init_rd_parameters(struct VP9_COMP *cpi); + +int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); + int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); +int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta); + void vp9_initialize_rd_consts(struct VP9_COMP *cpi); void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); int vp9_get_switchable_rate(const struct VP9_COMP *cpi, @@ -160,12 +187,12 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); -void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, - int best_mode_index); +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; + const int *const thresh_fact) { + return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { @@ -182,20 +209,27 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, const struct scale_factors *scale, const struct scale_factors *scale_uv); -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth); +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); #if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); #endif +void vp9_build_inter_mode_cost(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RD_H_ +#endif // VPX_VP9_ENCODER_VP9_RD_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c index 27d4e9d6d3..f8c6f850fc 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c @@ -31,6 +31,9 @@ #include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" @@ -40,7 +43,6 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_aq_variance.h" #define LAST_FRAME_MODE_MASK \ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) @@ -59,7 +61,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; @@ -73,34 +77,37 @@ struct rdcost_block_args { int64_t best_rd; int exit_early; int use_fast_coef_costing; - const scan_order *so; + const ScanOrder *so; uint8_t skippable; + struct buf_2d *this_recon; }; #define LAST_NEW_MV_INDEX 6 + +#if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - { NEARESTMV, { LAST_FRAME, NONE } }, - { NEARESTMV, { ALTREF_FRAME, NONE } }, - { NEARESTMV, { GOLDEN_FRAME, NONE } }, + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE } }, + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, - { NEWMV, { LAST_FRAME, NONE } }, - { NEWMV, { ALTREF_FRAME, NONE } }, - { NEWMV, { GOLDEN_FRAME, NONE } }, + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { NEARMV, { LAST_FRAME, NONE } }, - { NEARMV, { ALTREF_FRAME, NONE } }, - { NEARMV, { GOLDEN_FRAME, NONE } }, + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE } }, - { ZEROMV, { GOLDEN_FRAME, NONE } }, - { ZEROMV, { ALTREF_FRAME, NONE } }, + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { TM_PRED, { INTRA_FRAME, NONE } }, + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -110,21 +117,22 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE } }, - { V_PRED, { INTRA_FRAME, NONE } }, - { D135_PRED, { INTRA_FRAME, NONE } }, - { D207_PRED, { INTRA_FRAME, NONE } }, - { D153_PRED, { INTRA_FRAME, NONE } }, - { D63_PRED, { INTRA_FRAME, NONE } }, - { D117_PRED, { INTRA_FRAME, NONE } }, - { D45_PRED, { INTRA_FRAME, NONE } }, + { H_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, }; static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } }, - { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, - { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, }; +#endif // !CONFIG_REALTIME_ONLY static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, int min_plane, int max_plane) { @@ -151,10 +159,14 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } } -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +#if !CONFIG_REALTIME_ONLY +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -164,12 +176,9 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const int ref = xd->mi[0]->ref_frame[0]; unsigned int sse; unsigned int var = 0; - unsigned int sum_sse = 0; int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int rate; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : @@ -178,6 +187,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -186,6 +196,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; const int64_t dc_thr = p->quant_thred[0] >> shift; const int64_t ac_thr = p->quant_thred[1] >> shift; + unsigned int sum_sse = 0; // The low thresholds are used to measure if the prediction errors are // low enough so that we can skip the mode search. const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2); @@ -195,9 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; - sum_sse = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -233,34 +249,38 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; // Fast approximate the modelling function. if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse; - int quantizer = (pd->dequant[1] >> dequant_shift); - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; } else { - vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs], - pd->dequant[1] >> dequant_shift, &rate, - &dist); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); rate_sum += rate; - dist_sum += dist; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; - *skip_sse_sb = total_sse << 4; + *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; - *out_dist_sum = dist_sum << 4; + *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, @@ -284,22 +304,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, return error; } -int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff, - const tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - // Note that the C versions of these 2 functions (vp9_block_error and - // vp9_highbd_block_error_8bit are the same, but the optimized assembly - // routines are not compatible in the non high bitdepth configuration, so - // they still cannot share the same name. - return vp9_block_error_c(coeff, dqcoeff, block_size, ssz); -} - static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd) { if (bd == 8) { - return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz); + return vp9_block_error(coeff, dqcoeff, block_size, ssz); } else { return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd); } @@ -321,7 +331,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, return error; } -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size) { int i; int64_t error = 0; @@ -358,11 +368,11 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = x->token_costs[tx_size][type][is_inter_block(mi)]; uint8_t token_cache[32 * 32]; - int c, cost; + int cost; #if CONFIG_VP9_HIGHBITDEPTH - const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else - const int *cat6_high_cost = vp9_get_high_cost_table(8); + const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif // Check for consistency of tx_size with mode info @@ -373,10 +383,10 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, if (eob == 0) { // single eob token cost = token_costs[0][0][pt][EOB_TOKEN]; - c = 0; } else { if (use_fast_coef_costing) { int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -407,6 +417,7 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, } else { // !use_fast_coef_costing int band_left = *band_count++; + int c; // dc token int v = qcoeff[0]; @@ -447,9 +458,64 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, return cost; } -static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, - int subsampling_dim, int blk_dim) { - return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +// Copy all visible 4x4s in the transform block. +static void copy_block_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, + pd->subsampling_x, blk_col); + int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, + pd->subsampling_y, blk_row); + const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + const int w = tx_4x4_w << 2; + const int h = tx_4x4_h << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0, + 0, 0, w, h, xd->bd); + } else { +#endif + vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w, + h); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } else { + int r, c; + int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4; + uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + NULL, 0, 0, 0, 0, 4, 4, xd->bd); + } else { +#endif + vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0, + 0, 0, 0, 4, 4); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } + } + } + (void)is_highbd; } // Compute the pixel domain sum square error on all visible 4x4s in the @@ -492,50 +558,17 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, - const struct macroblockd_plane *const pd, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { - int64_t sse; - const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; - const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; - int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, - pd->subsampling_x, blk_col); - int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, - pd->subsampling_y, blk_row); - if (tx_bsize == BLOCK_4X4 || - (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_bsize); - } else { - int r, c; - int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); - int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); - sse = 0; - // if we are in the unrestricted motion border. - for (r = 0; r < max_r; ++r) { - // Skip visiting the sub blocks that are wholly within the UMV. - for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, BLOCK_4X4); - } - } - } - return sse; -} - static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse) { + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (!out_recon && x->block_tx_domain && eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -555,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (x->skip_encode && !is_inter_block(xd->mi[0])) { // TODO(jingning): tune the model to better capture the distortion. - const int64_t p = + const int64_t mean_quant_error = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> #if CONFIG_VP9_HIGHBITDEPTH (shift + 2 + (bd - 8) * 2); #else (shift + 2); #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; } } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; @@ -574,15 +607,27 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const int dst_idx = 4 * (blk_row * dst_stride + blk_col); const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; + uint8_t *out_recon_ptr = 0; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; - tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } *out_sse = (int64_t)tmp * 16; + if (out_recon) { + const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); + out_recon_ptr = &out_recon->buf[out_recon_idx]; + copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr, + out_recon->stride, blk_row, blk_col, plane_bsize, + tx_bsize); + } - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -592,42 +637,42 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon = CONVERT_TO_BYTEPTR(recon); - vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, - bs, bs, xd->bd); + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } + recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->itxm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -635,6 +680,10 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col, plane_bsize, tx_bsize); + if (out_recon) { + copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } } *out_dist = (int64_t)tmp * 16; @@ -655,34 +704,60 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, MODE_INFO *const mi = xd->mi[0]; int64_t rd1, rd2, rd; int rate; - int64_t dist; - int64_t sse; + int64_t dist = INT64_MAX; + int64_t sse = INT64_MAX; const int coeff_ctx = combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); + struct buf_2d *recon = args->this_recon; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; +#endif if (args->exit_early) return; if (!is_inter_block(mi)) { - struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, - args->t_left, &mi->skip }; vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - &intra_arg); + &encode_b_arg); + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); } else { - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; - const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; - const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; - sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -692,17 +767,33 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, blk_row, blk_col, plane_bsize, tx_bsize); dist = (int64_t)tmp * 16; } - } else if (max_txsize_lookup[plane_bsize] == tx_size) { - if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_NONE) { + } else { + int skip_txfm_flag = SKIP_TXFM_NONE; + if (max_txsize_lookup[plane_bsize] == tx_size) + skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + + // This reduces the risk of bad perceptual quality due to bad prediction. + // We always force the encoder to perform transform and quantization. + if (!args->cpi->sf.allow_skip_txfm_ac_dc && + skip_txfm_flag == SKIP_TXFM_AC_DC) { + skip_txfm_flag = SKIP_TXFM_NONE; + } + + if (skip_txfm_flag == SKIP_TXFM_NONE || + (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int use_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); - } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_AC_ONLY) { + tx_size, &dist, &sse, recon, sse_calc_done); + } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); @@ -722,19 +813,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, dist = VPXMAX(0, sse - dc_correct); } } else { - // SKIP_TXFM_AC_DC - // skip forward transform - x->plane[plane].eobs[block] = 0; - sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; - dist = sse; + assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC."); } - } else { - // full forward transform and quantization - vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); - dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); } rd = RDCOST(x->rdmult, x->rddiv, 0, dist); @@ -751,9 +831,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, // TODO(jingning): temporarily enabled only for luma component rd = VPXMIN(rd1, rd2); - if (plane == 0) + if (plane == 0) { x->zcoeff_blk[tx_size][block] = - !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + !x->plane[plane].eobs[block] || + (x->sharpness == 0 && rd1 > rd2 && !xd->lossless); + x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; + } args->this_rate += rate; args->this_dist += dist; @@ -771,7 +854,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, - TX_SIZE tx_size, int use_fast_coef_casting) { + TX_SIZE tx_size, int use_fast_coef_costing, + struct buf_2d *recon) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -779,8 +863,9 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, args.cpi = cpi; args.x = x; args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; + args.use_fast_coef_costing = use_fast_coef_costing; args.skippable = 1; + args.this_recon = recon; if (plane == 0) xd->mi[0]->tx_size = tx_size; @@ -805,7 +890,8 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, - int64_t ref_best_rd, BLOCK_SIZE bs) { + int64_t ref_best_rd, BLOCK_SIZE bs, + struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -815,13 +901,13 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mi->tx_size = VPXMIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs, - mi->tx_size, cpi->sf.use_fast_coef_costing); + mi->tx_size, cpi->sf.use_fast_coef_costing, recon); } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, int64_t ref_best_rd, - BLOCK_SIZE bs) { + BLOCK_SIZE bs, struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -833,20 +919,34 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; + const int tx_size_ctx = get_tx_size_context(xd); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]); + uint8_t *recon_buf[TX_SIZES]; + for (n = 0; n < TX_SIZES; ++n) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]); + } else { + recon_buf[n] = (uint8_t *)recon_buf16[n]; + } + } +#else + DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH - const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -855,15 +955,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); + const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; + if (recon) { + struct buf_2d this_recon; + this_recon.buf = recon_buf[n]; + this_recon.stride = recon->stride; + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, &this_recon); + } else { + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, 0); } - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -905,11 +1007,25 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mi->tx_size]; *psse = sse[mi->tx_size]; + if (recon) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + memcpy(CONVERT_TO_SHORTPTR(recon->buf), + CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]), + 64 * 64 * sizeof(uint16_t)); + } else { +#endif + memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } } static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, - BLOCK_SIZE bs, int64_t ref_best_rd) { + BLOCK_SIZE bs, int64_t ref_best_rd, + struct buf_2d *recon) { MACROBLOCKD *xd = &x->e_mbd; int64_t sse; int64_t *ret_sse = psse ? psse : &sse; @@ -918,10 +1034,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } else { choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } } @@ -971,6 +1087,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, xd->mi[0]->tx_size = TX_4X4; + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { for (mode = DC_PRED; mode <= TM_PRED; ++mode) { @@ -995,9 +1113,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, const int block = (row + idy) * 2 + (col + idx); const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, @@ -1006,29 +1128,31 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0); if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; - vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, + vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16, dst_stride, p->eobs[block], xd->bd); } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); if (tx_type == DCT_DCT) vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1039,7 +1163,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next_highbd; vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), - dst, dst_stride, p->eobs[block], xd->bd); + dst16, dst_stride, p->eobs[block], xd->bd); } } } @@ -1061,7 +1185,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, num_4x4_blocks_wide * 4 * sizeof(uint16_t)); } } - next_highbd : {} + next_highbd: {} } if (best_rd >= rd_thresh || x->skip_encode) return best_rd; @@ -1098,7 +1222,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, @@ -1106,11 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1121,24 +1249,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; -#if CONFIG_VP9_HIGHBITDEPTH - distortion += - vp9_highbd_block_error_8bit( - coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> - 2; -#else distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2; -#endif if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst, @@ -1162,7 +1284,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, num_4x4_blocks_wide * 4); } - next : {} + next: {} } if (best_rd >= rd_thresh || x->skip_encode) return best_rd; @@ -1269,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd); + bsize, best_rd, /*recon=*/NULL); if (this_rate_tokenonly == INT_MAX) continue; @@ -1309,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (ref_best_rd < 0) is_cost_valid = 0; if (is_inter_block(mi) && is_cost_valid) { - int plane; for (plane = 1; plane < MAX_MB_PLANE; ++plane) vp9_subtract_plane(x, bsize, plane); } @@ -1321,7 +1442,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, - plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); + plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, + /*recon=*/NULL); if (pnrate == INT_MAX) { is_cost_valid = 0; break; @@ -1389,6 +1511,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } +#if !CONFIG_REALTIME_ONLY static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { @@ -1462,11 +1585,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; @@ -1503,10 +1626,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; + assert(!x->skip_block); + for (ref = 0; ref < 1 + is_compound; ++ref) { const int bw = b_width_log2_lookup[BLOCK_8X8]; const int h = 4 * (i >> bw); @@ -1526,7 +1651,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vp9_highbd_build_inter_predictor( - pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, + CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst), + pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2), xd->bd); @@ -1567,18 +1693,25 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; #endif int64_t ssz, rd, rd1, rd2; - tran_low_t *coeff; + tran_low_t *coeff, *qcoeff, *dqcoeff; + uint16_t *eob; int coeff_ctx; k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), - coeff, 8); - vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); + qcoeff = BLOCK_OFFSET(p->qcoeff, k); + dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k); + eob = &p->eobs[k]; + + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); #if CONFIG_VP9_HIGHBITDEPTH + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1599,6 +1732,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } +#endif // !CONFIG_REALTIME_ONLY typedef struct { int eobs; @@ -1626,6 +1760,7 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; +#if !CONFIG_REALTIME_ONLY static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { return (mv->row >> 3) < mv_limits->row_min || (mv->row >> 3) > mv_limits->row_max || @@ -1670,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, const MV_REFERENCE_FRAME ref_frames[2]) { if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && - (ref_frames[1] == NONE || + (ref_frames[1] == NO_REF_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == 0)) { int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1683,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, if (c2 > c3) return 0; } else { assert(this_mode == ZEROMV); - if (ref_frames[1] == NONE) { + if (ref_frames[1] == NO_REF_FRAME) { if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; @@ -1699,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } +static INLINE int skip_iters(int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. + if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + +#define MAX_JOINT_MV_SEARCH_ITERS 4 +static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) { + int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0 + if (sf_level >= 2) + num_iters = 0; + else if (sf_level >= 1) + num_iters = bsize < BLOCK_8X8 + ? 0 + : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS); + return num_iters; +} + static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int *rate_mv) { + int *rate_mv, int num_iters) { const VP9_COMMON *const cm = &cpi->common; const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; @@ -1711,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; int_mv ref_mv[2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1725,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Prediction buffer from second frame. #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH + // Check number of iterations do not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; @@ -1746,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; } // Since we have scaled the reference frames to match the size of the current @@ -1760,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < 4; ite++) { + for (ite = 0; ite < num_iters; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -1772,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. + // Skip further iterations of search if in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + // Initialized here because of compiler problem in Visual Studio. ref_yv12[0] = xd->plane[0].pre[0]; ref_yv12[1] = xd->plane[0].pre[1]; @@ -1781,9 +1996,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); vp9_highbd_build_inter_predictor( - ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); + CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride, + second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, + kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd); } else { second_pred = (uint8_t *)second_pred_alloc_16; vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, @@ -1824,8 +2039,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, bestsme = cpi->find_fractional_mv_step( x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph); + cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); } // Restore the pointer to the first (possibly scaled) prediction buffer. @@ -1837,6 +2052,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } else { break; } + if (ite < num_iters - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } } *rate_mv = 0; @@ -1857,7 +2076,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, static int64_t rd_pick_best_sub8x8_mode( VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { @@ -1879,6 +2098,8 @@ static int64_t rd_pick_best_sub8x8_mode( const BLOCK_SIZE bsize = mi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; SPEED_FEATURES *const sf = &cpi->sf; @@ -1888,7 +2109,7 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_zero(*bsi); - bsi->segment_rd = best_rd; + bsi->segment_rd = best_rd_so_far; bsi->ref_mv[0] = best_ref_mv; bsi->ref_mv[1] = second_best_ref_mv; bsi->mvp.as_int = best_ref_mv->as_int; @@ -1914,14 +2135,14 @@ static int64_t rd_pick_best_sub8x8_mode( int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - const int i = idy * 2 + idx; + const int block = idy * 2 + idx; int ref; for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx( - cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], mbmi_ext->mode_context); } @@ -1931,7 +2152,7 @@ static int64_t rd_pick_best_sub8x8_mode( struct buf_2d orig_pre[2]; mode_idx = INTER_OFFSET(this_mode); - bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, @@ -1939,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode( continue; memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[i][mode_idx].ta, t_above, - sizeof(bsi->rdstat[i][mode_idx].ta)); - memcpy(bsi->rdstat[i][mode_idx].tl, t_left, - sizeof(bsi->rdstat[i][mode_idx].tl)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) { + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; uint32_t bestsme = UINT_MAX; @@ -1956,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode( int cost_list[5]; const MvLimits tmp_mv_limits = x->mv_limits; - /* Is the best so far sufficiently good that we cant justify doing + /* Is the best so far sufficiently good that we can't justify doing * and new motion search. */ if (best_rd < label_mv_thresh) break; if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. - if (i > 0) { - bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; - if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; } } - if (i == 0) + if (block == 0) max_mv = x->max_mv_context[mi->ref_frame[0]]; else max_mv = @@ -1987,18 +2209,22 @@ static int64_t rd_pick_best_sub8x8_mode( mvp_full.col = bsi->mvp.as_mv.col >> 3; if (sf->adaptive_motion_search) { - mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; - mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX && + x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) { + mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + } step_param = VPXMAX(step_param, 8); } // adjust src pointer for this block - mi_buf_shift(x, i); + mi_buf_shift(x, block); vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); bestsme = vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + sadpb, sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); @@ -2009,56 +2235,60 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->find_fractional_mv_step( x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, - sf->mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0); + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction - seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; } - if (sf->adaptive_motion_search) - x->pred_mv[mi->ref_frame[0]] = *new_mv; + x->pred_mv[mi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } if (has_second_rf) { - if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) continue; } if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); // adjust src pointers - mi_buf_shift(x, i); - if (sf->comp_inter_joint_search_thresh <= bsize) { + mi_buf_shift(x, block); + if (num_joint_search_iters) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[i], &rate_mv); - seg_mvs[i][mi->ref_frame[0]].as_int = + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); + seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; - seg_mvs[i][mi->ref_frame[1]].as_int = + seg_mvs[block][mi->ref_frame[1]].as_int = frame_mv[this_mode][mi->ref_frame[1]].as_int; } // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } - bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, x->nmvjointcost, x->mvcost); + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[i][mode_idx].mvs[ref].as_int = + bsi->rdstat[block][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; } @@ -2076,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode( for (ref = 0; ref < 1 + has_second_rf; ++ref) { subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { @@ -2084,53 +2314,54 @@ static int64_t rd_pick_best_sub8x8_mode( have_ref = 1; for (ref = 0; ref < 1 + has_second_rf; ++ref) have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && - ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], - sizeof(SEG_RDSTAT)); + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx] = ref_bsi->rdstat[block][mode_idx]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = - ref_bsi->rdstat[i + 1][mode_idx].eobs; + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = - ref_bsi->rdstat[i + 2][mode_idx].eobs; + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } continue; } } - bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment( - cpi, x, bsi->segment_rd - this_segment_rd, i, - &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist, - &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl, mi_row, mi_col); - if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[i][mode_idx].brdcost += - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); - bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; } - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } } /*for each 4x4 mode*/ if (best_rd == INT64_MAX) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2138,22 +2369,22 @@ static int64_t rd_pick_best_sub8x8_mode( } mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); - set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); - br += bsi->rdstat[i][mode_idx].brate; - bd += bsi->rdstat[i][mode_idx].bdist; - block_sse += bsi->rdstat[i][mode_idx].bsse; - segmentyrate += bsi->rdstat[i][mode_idx].byrate; - this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; if (this_segment_rd > bsi->segment_rd) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2171,7 +2402,7 @@ static int64_t rd_pick_best_sub8x8_mode( // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - if (bsi->segment_rd > best_rd) return INT64_MAX; + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { mode_idx = INTER_OFFSET(bsi->modes[i]); @@ -2313,6 +2544,22 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, block_size); } +#if CONFIG_NON_GREEDY_MV +static int ref_frame_to_gf_rf_idx(int ref_frame) { + if (ref_frame == GOLDEN_FRAME) { + return 0; + } + if (ref_frame == LAST_FRAME) { + return 1; + } + if (ref_frame == ALTREF_FRAME) { + return 2; + } + assert(0); + return -1; +} +#endif + static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2320,19 +2567,35 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const VP9_COMMON *cm = &cpi->common; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; - int bestsme = INT_MAX; int step_param; - int sadpb = x->sadperbit16; MV mvp_full; int ref = mi->ref_frame[0]; MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; const MvLimits tmp_mv_limits = x->mv_limits; int cost_list[5]; - + const int best_predmv_idx = x->mv_best_ref_index[ref]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; MV pred_mv[3]; + + int bestsme = INT_MAX; +#if CONFIG_NON_GREEDY_MV + int gf_group_idx = cpi->twopass.gf_group.index; + int gf_rf_idx = ref_frame_to_gf_rf_idx(ref); + BLOCK_SIZE square_bsize = get_square_block_size(bsize); + int_mv nb_full_mvs[NB_MVS_NUM] = { 0 }; + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, gf_group_idx, gf_rf_idx, square_bsize); + const int nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + const int lambda = (pw * ph) / 4; + assert(pw * ph == lambda << 2); +#else // CONFIG_NON_GREEDY_MV + int sadpb = x->sadperbit16; +#endif // CONFIG_NON_GREEDY_MV + pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; pred_mv[2] = x->pred_mv[ref]; @@ -2361,7 +2624,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { - int boffset = + const int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] - VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); step_param = VPXMAX(step_param, boffset); @@ -2379,14 +2642,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int i; for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].row = 0; - x->pred_mv[ref].col = 0; + x->pred_mv[ref].row = INT16_MAX; + x->pred_mv[ref].col = INT16_MAX; tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - xd->plane[i].pre[0] = backup_yv12[i]; + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; } return; } @@ -2398,14 +2661,65 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // after full-pixel motion search. vp9_set_mv_search_range(&x->mv_limits, &ref_mv); - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; - + mvp_full = pred_mv[best_predmv_idx]; mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, - &tmp_mv->as_mv, INT_MAX, 1); +#if CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_diamond_new(cpi, x, bsize, &mvp_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, + &tmp_mv->as_mv); +#else // CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + + if (cpi->sf.enhanced_full_pixel_motion_search) { + int i; + for (i = 0; i < 3; ++i) { + int this_me; + MV this_mv; + int diff_row; + int diff_col; + int step; + + if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue; + if (i == best_predmv_idx) continue; + + diff_row = ((int)pred_mv[i].row - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >> + 3; + diff_col = ((int)pred_mv[i].col - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].col) >> + 3; + if (diff_row == 0 && diff_col == 0) continue; + if (diff_row < 0) diff_row = -diff_row; + if (diff_col < 0) diff_col = -diff_col; + step = get_msb((diff_row + diff_col + 1) >> 1); + if (step <= 0) continue; + + mvp_full = pred_mv[i]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_diamond_new( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), lambda, 1, nb_full_mvs, + nb_full_mv_num, &this_mv); +#else // CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &this_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + if (this_me < bestsme) { + tmp_mv->as_mv = this_mv; + bestsme = this_me; + } + } + } x->mv_limits = tmp_mv_limits; @@ -2414,13 +2728,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv; + x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2445,26 +2760,63 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, // However, once established that vector may be usable through the nearest and // near mv modes to reduce distortion in subsequent blocks and also improve // visual quality. -static int discount_newmv_test(const VP9_COMP *cpi, int this_mode, - int_mv this_mv, - int_mv (*mode_mv)[MAX_REF_FRAMES], - int ref_frame) { +static int discount_newmv_test(VP9_COMP *cpi, int this_mode, int_mv this_mv, + int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame, + int mi_row, int mi_col, BLOCK_SIZE bsize) { +#if CONFIG_NON_GREEDY_MV + (void)mode_mv; + (void)this_mv; + if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) { + const int gf_group_idx = cpi->twopass.gf_group.index; + const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame); + const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx]; + const MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, gf_group_idx, gf_rf_idx, cpi->tpl_bsize); + const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize]; + const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize]; + const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h); + const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w); + const int mv_mode = + tpl_frame + .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col]; + if (mv_mode == NEW_MV_MODE) { + int_mv tpl_new_mv = + vp9_motion_field_mi_get_mv(motion_field, tpl_mi_row, tpl_mi_col); + int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row); + int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col); + if (VPXMAX(row_diff, col_diff) <= 8) { + return 1; + } else { + return 0; + } + } else { + return 0; + } + } else { + return 0; + } +#else + (void)mi_row; + (void)mi_col; + (void)bsize; return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && (this_mv.as_int != 0) && ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && ((mode_mv[NEARMV][ref_frame].as_int == 0) || (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +#endif } static int64_t handle_inter_mode( VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2, int64_t *distortion, int *skippable, int *rate_y, int *rate_uv, - int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, - int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], + int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], - int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, - const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t filter_cache[], int best_mode_index) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -2482,9 +2834,8 @@ static int64_t handle_inter_mode( #else DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH - int pred_exists = 0; int intpel_mv; - int64_t rd, tmp_rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; @@ -2493,13 +2844,12 @@ static int64_t handle_inter_mode( uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -2538,13 +2888,23 @@ static int64_t handle_inter_mode( if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, num_joint_search_iters); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2556,7 +2916,13 @@ static int64_t handle_inter_mode( *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2567,7 +2933,8 @@ static int64_t handle_inter_mode( // under certain circumstances where we want to help initiate a weak // motion field, where the distortion gain for a single block may not // be enough to overcome the cost of a new mv. - if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) { + if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row, + mi_col, bsize)) { *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } else { *rate2 += rate_mv; @@ -2600,8 +2967,8 @@ static int64_t handle_inter_mode( // // Under some circumstances we discount the cost of new mv mode to encourage // initiation of a motion field. - if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, - refs[0])) { + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0], + mi_row, mi_col, bsize)) { *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]), cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]])); @@ -2609,23 +2976,45 @@ static int64_t handle_inter_mode( *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); } + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. + if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) { + // Check when the single inter mode is pruned, NEARESTMV or NEWMV modes + // are not early terminated. This ensures all single modes are not getting + // skipped when the speed feature is enabled. + assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); + return INT64_MAX; + } + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) return INT64_MAX; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { best_filter = EIGHTTAP; - } else if (best_filter == SWITCHABLE) { + } else if (best_filter == SWITCHABLE && enable_interp_search) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2635,6 +3024,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2668,9 +3060,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -2703,7 +3102,6 @@ static int64_t handle_inter_mode( if ((cm->interp_filter == SWITCHABLE && newbest) || (cm->interp_filter != SWITCHABLE && cm->interp_filter == mi->interp_filter)) { - pred_exists = 1; tmp_rd = best_rd; skip_txfm_sb = tmp_skip_sb; @@ -2715,12 +3113,15 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; - if (pred_exists) { + if (tmp_rd != INT64_MAX) { if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { @@ -2735,9 +3136,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); @@ -2765,7 +3166,7 @@ static int64_t handle_inter_mode( memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); memcpy(x->bsse, bsse, sizeof(bsse)); - if (!skip_txfm_sb) { + if (!skip_txfm_sb || xd->lossless) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; int64_t rdcosty = INT64_MAX; @@ -2773,7 +3174,7 @@ static int64_t handle_inter_mode( // Y cost and distortion vp9_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize, - ref_best_rd); + ref_best_rd, recon); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -2815,6 +3216,7 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); return 0; // The rate-distortion cost will be re-calculated by caller. } +#endif // !CONFIG_REALTIME_ONLY void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, @@ -2829,7 +3231,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, x->skip_encode = 0; ctx->skip = 0; xd->mi[0]->ref_frame[0] = INTRA_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; // Initialize interp_filter here so we do not have to check for inter block // modes in get_pred_context_switchable_interp() xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; @@ -2868,60 +3270,97 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); } +#if !CONFIG_REALTIME_ONLY // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. -#define LOW_VAR_THRESH 16 -#define VLOW_ADJ_MAX 25 -#define VHIGH_ADJ_MAX 8 +#define LOW_VAR_THRESH 250 +#define VAR_MULT 250 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 }; + static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, + struct buf_2d *recon, MV_REFERENCE_FRAME ref_frame, - unsigned int source_variance) { + MV_REFERENCE_FRAME second_ref_frame, + PREDICTION_MODE this_mode) { MACROBLOCKD *const xd = &x->e_mbd; - unsigned int recon_variance; - unsigned int absvar_diff = 0; - int64_t var_error = 0; - int64_t var_factor = 0; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; + unsigned int var_diff = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + unsigned int low_var_thresh = LOW_VAR_THRESH; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, - bsize, xd->bd); + rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); } else { - recon_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); } #else - recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); #endif // CONFIG_VP9_HIGHBITDEPTH - if ((source_variance + recon_variance) > LOW_VAR_THRESH) { - absvar_diff = (source_variance > recon_variance) - ? (source_variance - recon_variance) - : (recon_variance - source_variance); + // Scale based on area in 8x8 blocks + rec_variance /= (bw * bh); + src_variance /= (bw * bh); - var_error = ((int64_t)200 * source_variance * recon_variance) / - (((int64_t)source_variance * source_variance) + - ((int64_t)recon_variance * recon_variance)); - var_error = 100 - var_error; + if (content_type == VP9E_CONTENT_FILM) { + if (cpi->oxcf.pass == 2) { + // Adjust low variance threshold based on estimated group noise enegry. + double noise_factor = + (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF; + low_var_thresh = (unsigned int)(low_var_thresh * noise_factor); + + if (ref_frame == INTRA_FRAME) { + low_var_thresh *= 2; + if (this_mode == DC_PRED) low_var_thresh *= 5; + } else if (second_ref_frame > INTRA_FRAME) { + low_var_thresh *= 2; + } + } + } else { + low_var_thresh = LOW_VAR_THRESH / 2; } - // Source variance above a threshold and ref frame is intra. - // This case is targeted mainly at discouraging intra modes that give rise - // to a predictor with a low spatial complexity compared to the source. - if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) && - (source_variance > recon_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error)); - // A second possible case of interest is where the source variance - // is very low and we wish to discourage false texture or motion trails. - } else if ((source_variance < (LOW_VAR_THRESH >> 1)) && - (recon_variance > source_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error)); + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. + src_rec_min = VPXMIN(src_variance, rec_variance); + + if (src_rec_min > low_var_thresh) return; + + // We care more when the reconstruction has lower variance so give this case + // a stronger weighting. + var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2 + : (rec_variance - src_variance) / 2; + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); + + if ((content_type == VP9E_CONTENT_FILM) && + ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) { + var_factor *= 2; } + *this_rd += (*this_rd * var_factor) / 100; + + (void)xd; } +#endif // !CONFIG_REALTIME_ONLY // Do we have an internal image edge (e.g. formatting bars). int vp9_internal_image_edge(VP9_COMP *cpi) { @@ -2941,6 +3380,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. @@ -2968,6 +3408,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) { // For two pass account for any formatting bars detected. if (cpi->oxcf.pass == 2) { TWO_PASS *twopass = &cpi->twopass; + vpx_clear_system_state(); // The inactive region is specified in MBs not mi units. // The image edge is in the following MB row. @@ -2992,6 +3433,15 @@ int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) { vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE); } +#if !CONFIG_REALTIME_ONLY +static void init_frame_mv(int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]) { + for (int mode = 0; mode < MB_MODE_COUNT; ++mode) { + for (int ref_frame = 0; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + frame_mv[mode][ref_frame].as_int = INVALID_MV; + } + } +} + void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3009,12 +3459,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, unsigned char segment_id = mi->segment_id; int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 }; int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3032,20 +3481,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint8_t ref_frame_skip_mask[2] = { 0, 1 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; + int8_t *tile_mode_map = tile_data->mode_map[bsize]; + int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + struct buf_2d *recon; + struct buf_2d recon_buf; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]); + recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH + ? CONVERT_TO_BYTEPTR(recon16) + : (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]); + recon_buf.buf = recon8; +#endif // CONFIG_VP9_HIGHBITDEPTH + recon_buf.stride = 64; + recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0; + vp9_zero(best_mbmode); x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3069,9 +3537,12 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; + init_frame_mv(frame_mv); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && + !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); @@ -3081,7 +3552,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { // Skip checking missing references in both single and compound reference // modes. Note that a mode will be skipped if both reference frames // are masked out. @@ -3127,7 +3598,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (cpi->rc.is_src_frame_alt_ref) { if (sf->alt_ref_search_fp) { mode_skip_mask[ALTREF_FRAME] = 0; - ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); + ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME) & 0xff; ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; } } @@ -3144,32 +3615,37 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } - if (bsize > sf->max_intra_bsize) { + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); } mode_skip_mask[INTRA_FRAME] |= - ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + (uint16_t)~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? mode_skip_start : 0; + while (midx > 4) { uint8_t end_pos = 0; for (i = 5; i < midx; ++i) { - if (mode_threshold[mode_map[i - 1]] > mode_threshold[mode_map[i]]) { - uint8_t tmp = mode_map[i]; - mode_map[i] = mode_map[i - 1]; - mode_map[i - 1] = tmp; + if (mode_threshold[tile_mode_map[i - 1]] > + mode_threshold[tile_mode_map[i]]) { + uint8_t tmp = tile_mode_map[i]; + tile_mode_map[i] = tile_mode_map[i - 1]; + tile_mode_map[i - 1] = tmp; end_pos = i; } } midx = end_pos; } + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3187,21 +3663,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_mode_order[mode_index].ref_frame[0]; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; + + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. if (midx == mode_skip_start && best_mode_index >= 0) { switch (best_mbmode.ref_frame[0]) { case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; + case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break; case GOLDEN_FRAME: ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3218,6 +3703,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (best_rd < mode_threshold[mode_index]) continue; + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + if (sf->motion_field_mode_search) { const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize], tile_info->mi_col_end - mi_col); @@ -3231,7 +3719,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MODE_INFO *ref_mi; int const_motion = 1; int skip_ref_frame = !cb_partition_search_ctrl; - MV_REFERENCE_FRAME rf = NONE; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; int_mv ref_mv; ref_mv.as_int = INVALID_MV; @@ -3248,7 +3736,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; - if (rf == NONE) rf = xd->mi[-1]->ref_frame[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; for (i = 0; i < mi_height; ++i) { ref_mi = xd->mi[i * xd->mi_stride - 1]; const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && @@ -3265,12 +3753,16 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (this_mode == NEARMV || this_mode == ZEROMV) continue; } - comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -3295,7 +3787,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - const unsigned int skip_intra_var_thresh = 64; + const unsigned int skip_intra_var_thresh = + (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64; if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && x->source_variance < skip_intra_var_thresh) continue; @@ -3339,19 +3832,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, - best_rd); + best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3362,11 +3866,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, - &disable_skip, frame_mv, mi_row, mi_col, single_newmv, - single_inter_filter, single_skippable, &total_sse, best_rd, - &mask_filter, filter_cache); + recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); @@ -3393,7 +3904,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += skip_cost1; - } else if (ref_frame != INTRA_FRAME && !xd->lossless) { + } else if (ref_frame != INTRA_FRAME && !xd->lossless && + !cpi->oxcf.sharpness) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0, distortion2) < RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) { @@ -3417,10 +3929,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Apply an adjustment to the rd value based on the similarity of the - // source variance and reconstructed variance. - rd_variance_adjustment(cpi, x, bsize, &this_rd, ref_frame, - x->source_variance); + if (recon) { + // In film mode bias against DC pred and other intra if there is a + // significant difference between the variance of the sub blocks in the + // the source. Also apply some bias against compound modes which also + // tend to blur fine texture such as film grain over time. + // + // The sub block test here acts in the case where one or more sub + // blocks have high relatively variance but others relatively low + // variance. Here the high variance sub blocks may push the + // total variance for the current block size over the thresholds + // used in rd_variance_adjustment() below. + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + if (max_energy > min_energy) { + if (ref_frame == INTRA_FRAME) { + if (this_mode == DC_PRED) + this_rd += (this_rd * (max_energy - min_energy)); + else + this_rd += (this_rd * (max_energy - min_energy)) / 4; + } else if (second_ref_frame > INTRA_FRAME) { + this_rd += this_rd / 4; + } + } + } + } + // Apply an adjustment to the rd value based on the similarity of the + // source variance and reconstructed variance. + rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame, + second_ref_frame, this_mode); + } if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd @@ -3466,6 +4007,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3571,6 +4113,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3685,10 +4230,12 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mode = ZEROMV; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; x->skip = 1; + ctx->sum_y_eobs = 0; + if (cm->interp_filter != BILINEAR) { best_filter = EIGHTTAP; if (cm->interp_filter == SWITCHABLE && @@ -3759,9 +4306,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, unsigned char segment_id = mi->segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; + struct buf_2d yv12_mb[4][MAX_MB_PLANE] = { 0 }; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_pred_diff[REFERENCE_MODES]; @@ -3777,8 +4322,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; @@ -3787,6 +4332,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int internal_active_edge = vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3810,7 +4356,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, rd_cost->rate = INT_MAX; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -3829,7 +4375,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; @@ -3838,10 +4383,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame = vp9_ref_order[ref_index].ref_frame[0]; second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; + vp9_zero(x->sum_y_eobs); + #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { - int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + int ref_scaled = ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); if (second_ref_frame > INTRA_FRAME) ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); if (ref_scaled) continue; @@ -3864,7 +4412,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, case ALTREF_FRAME: ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3878,13 +4426,22 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) + &rd_thresh_freq_fact[ref_index])) continue; + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue; + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + + if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame))) + continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; @@ -3978,7 +4535,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, : NULL; if (scaled_ref_frame[ref]) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. @@ -4048,9 +4604,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, tmp_best_sse = total_sse; tmp_best_skippable = skippable; tmp_best_mbmode = *mi; + x->sum_y_eobs[TX_4X4] = 0; for (i = 0; i < 4; i++) { tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; } pred_exists = 1; if (switchable_filter_index == 0 && sf->use_rd_breakout && @@ -4080,6 +4638,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, &rate, &rate_y, &distortion, &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; + x->sum_y_eobs[TX_4X4] = 0; + for (i = 0; i < 4; i++) { + x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; + x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i]; + } } else { total_sse = tmp_best_sse; rate = tmp_best_rate; @@ -4108,14 +4671,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV + // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu)) { for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4132,7 +4694,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4215,6 +4776,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane); memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk); + ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4]; for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i]; @@ -4330,12 +4892,18 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!is_inter_block(&best_mbmode)) { for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode; } else { - for (i = 0; i < 4; ++i) - memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); + for (i = 0; i < 4; ++i) xd->mi[0]->bmi[i] = best_bmodes[i]; mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } + // If the second reference does not exist, set the corresponding mv to zero. + if (mi->ref_frame[1] == NO_REF_FRAME) { + mi->mv[1].as_int = 0; + for (i = 0; i < 4; ++i) { + mi->bmi[i].as_mv[1].as_int = 0; + } + } for (i = 0; i < REFERENCE_MODES; ++i) { if (best_pred_rd[i] == INT64_MAX) @@ -4360,3 +4928,4 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff, 0); } +#endif // !CONFIG_REALTIME_ONLY diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h index 795c91aef7..e1147ff943 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RDOPT_H_ -#define VP9_ENCODER_VP9_RDOPT_H_ +#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_ +#define VPX_VP9_ENCODER_VP9_RDOPT_H_ #include "vp9/common/vp9_blockd.h" @@ -29,6 +29,7 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, @@ -39,21 +40,24 @@ void vp9_rd_pick_inter_mode_sb_seg_skip( struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif int vp9_internal_image_edge(struct VP9_COMP *cpi); int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step); int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step); int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RDOPT_H_ +#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c index f6c4aad4d3..352d8f1273 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c @@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // with be with length == 1, which return 1, resulting in an infinite + // loop. + break; + } } return steps; } @@ -424,11 +430,11 @@ void vp9_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; - uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf)); uint8_t *tmpbuf = - (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); - uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height); - uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2); + (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf)); + uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf)); + uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2)); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; assert(width > 0); @@ -506,10 +512,12 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; filter = interp_filters[sub_pel]; sum = 0; - for (k = 0; k < INTERP_TAPS; ++k) + for (k = 0; k < INTERP_TAPS; ++k) { + assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength); sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? 0 : int_pel - INTERP_TAPS / 2 + 1 + k)]; + } *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // Middle part. @@ -720,6 +728,10 @@ void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); @@ -738,83 +750,3 @@ Error: free(arrbuf2); } #endif // CONFIG_VP9_HIGHBITDEPTH - -void vp9_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth) { - vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - vp9_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, - owidth / 2, ouv_stride); - vp9_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, - owidth / 2, ouv_stride); -} - -void vp9_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth) { - vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - vp9_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, - ouv_stride); - vp9_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, - ouv_stride); -} - -void vp9_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth) { - vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); - vp9_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, - ouv_stride); - vp9_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, - ouv_stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd) { - vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, - owidth / 2, ouv_stride, bd); - vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, - owidth / 2, ouv_stride, bd); -} - -void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd) { - vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - vp9_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, - owidth / 2, ouv_stride, bd); - vp9_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, - owidth / 2, ouv_stride, bd); -} - -void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd) { - vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, - oy_stride, bd); - vp9_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, - ouv_stride, bd); - vp9_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, - ouv_stride, bd); -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h index d3282ee191..7a984dbc94 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RESIZE_H_ -#define VP9_ENCODER_VP9_RESIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_ +#define VPX_VP9_ENCODER_VP9_RESIZE_H_ #include #include "vpx/vpx_integer.h" @@ -21,48 +21,15 @@ extern "C" { void vp9_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); -void vp9_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth); -void vp9_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth); -void vp9_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, - int ouv_stride, int oheight, int owidth); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int bd); -void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd); -void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd); -void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, - uint8_t *oy, int oy_stride, uint8_t *ou, - uint8_t *ov, int ouv_stride, int oheight, - int owidth, int bd); #endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RESIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c index 4a5a68e07a..d75488a8e6 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c @@ -9,6 +9,7 @@ */ #include +#include #include "vpx_mem/vpx_mem.h" @@ -38,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, } void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - seg->feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1u << feature_id); } void vp9_clear_segdata(struct segmentation *seg, int segment_id, @@ -46,6 +47,59 @@ void vp9_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } +void vp9_psnr_aq_mode_setup(struct segmentation *seg) { + int i; + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < MAX_SEGMENTS; ++i) { + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2))); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg) { + const VP9_COMMON *cm = &cpi->common; + const int seg_counts = cpi->kmeans_ctr_num; + const int base_qindex = cm->base_qindex; + const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth); + const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2]; + const double var_diff_scale = 4.0; + int i; + + assert(seg_counts <= MAX_SEGMENTS); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < seg_counts / 2; ++i) { + double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i]; + double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + + for (; i < seg_counts; ++i) { + double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr; + double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + // Based on set of segment counts calculate a probability tree static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) { // Work out probabilities of each segment diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h index 562805543b..9404c38bc8 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ -#define VP9_ENCODER_VP9_SEGMENTATION_H_ +#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ +#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" @@ -26,6 +26,11 @@ void vp9_disable_segfeature(struct segmentation *seg, int segment_id, void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); +void vp9_psnr_aq_mode_setup(struct segmentation *seg); + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg); + // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. // @@ -47,4 +52,4 @@ void vp9_reset_segment_features(struct segmentation *seg); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SEGMENTATION_H_ +#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c index 3f3d48fb9f..cc6c967767 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c @@ -15,75 +15,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_skin_detection.h" -#define MODEL_MODE 1 - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Thresholds on luminance. -static const int y_low = 40; -static const int y_high = 220; - -// Evaluates the Mahalanobis distance measure for the input CbCr values. -static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion) { - if (y < y_low || y > y_high) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; i++) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) - return 0; - else if (motion == 0 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) - return 0; - else - return 1; - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn) { @@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; - return vp9_skin_pixel(ysource, usource, vsource, motion); + return vpx_skin_pixel(ysource, usource, vsource, motion); } } -#ifdef OUTPUT_YUV_SKINMAP -// For viewing skin map on input source. -void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { - int i, j, mi_row, mi_col, num_bl; +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; VP9_COMMON *const cm = &cpi->common; - uint8_t *y; const uint8_t *src_y = cpi->Source->y_buffer; const uint8_t *src_u = cpi->Source->u_buffer; const uint8_t *src_v = cpi->Source->v_buffer; const int src_ystride = cpi->Source->y_stride; const int src_uvstride = cpi->Source->uv_stride; - int y_bsize = 16; // Use 8x8 or 16x16. - int uv_bsize = y_bsize >> 1; - int ypos = y_bsize >> 1; - int uvpos = uv_bsize >> 1; - int shy = (y_bsize == 8) ? 3 : 4; - int shuv = shy - 1; - int fac = y_bsize / 8; - // Use center pixel or average of center 2x2 pixels. - int mode_filter = 0; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. + if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. + for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mi_row, mi_col, num_bl; + VP9_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + YV12_BUFFER_CONFIG skinmap; memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, @@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { num_bl = 0; for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { - int is_skin = 0; - if (mode_filter == 1) { - // Use 2x2 average at center. - uint8_t ysource = src_y[ypos * src_ystride + ypos]; - uint8_t usource = src_u[uvpos * src_uvstride + uvpos]; - uint8_t vsource = src_v[uvpos * src_uvstride + uvpos]; - uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos]; - uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)]; - uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)]; - uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2; - usource = (usource + usource2 + usource3 + usource4) >> 2; - vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2; - is_skin = vp9_skin_pixel(ysource, usource, vsource, 1); - } else { - int block_size = BLOCK_8X8; - int consec_zeromv = 0; - int bl_index = mi_row * cm->mi_cols + mi_col; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - if (y_bsize == 8) - consec_zeromv = cpi->consec_zero_mv[bl_index]; - else - consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - if (y_bsize == 16) block_size = BLOCK_16X16; - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, block_size, consec_zeromv, 0); - } + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; for (i = 0; i < y_bsize; i++) { for (j = 0; j < y_bsize; j++) { - if (is_skin) - y[i * src_ystride + j] = 255; - else - y[i * src_ystride + j] = src_y[i * src_ystride + j]; + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; } } num_bl++; y += y_bsize; src_y += y_bsize; - src_u += uv_bsize; - src_v += uv_bsize; } y += (src_ystride << shy) - (num_bl << shy); src_y += (src_ystride << shy) - (num_bl << shy); - src_u += (src_uvstride << shuv) - (num_bl << shuv); - src_v += (src_uvstride << shuv) - (num_bl << shuv); } - vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file); + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); vpx_free_frame_buffer(&skinmap); } #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h index c77382dbd7..46a722af9b 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h @@ -8,10 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_ -#define VP9_ENCODER_VP9_SKIN_MAP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ +#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" #ifdef __cplusplus extern "C" { @@ -19,23 +21,20 @@ extern "C" { struct VP9_COMP; -// #define OUTPUT_YUV_SKINMAP - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion); - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn); +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. -void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); -extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f); +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SKIN_MAP_H_ +#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c index 81cb431ba5..3268f64648 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c @@ -16,34 +16,33 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patters for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; -#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +#if !CONFIG_REALTIME_ONLY +// Define 3 mesh density levels to control the number of searches. +#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN - good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { - { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, - { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, }; -static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = { - 50, 25, 15, 5, 1, 1 -}; // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const VP9_COMP *cpi) { - return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi); + return frame_is_kf_gf_arf(cpi); } // Sets a partition size down to which the auto partition code will always // search (can go lower), based on the image dimensions. The logic here // is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is +// partly on the screen area that over which they propagate. Propagation is // limited by transform block size but the screen area take up by a given block // size will be larger for a small image format stretched to full screen. static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { @@ -66,56 +65,127 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { VP9_COMMON *const cm = &cpi->common; + const int min_frame_size = VPXMIN(cm->width, cm->height); + const int is_480p_or_larger = min_frame_size >= 480; + const int is_720p_or_larger = min_frame_size >= 720; + const int is_1080p_or_larger = min_frame_size >= 1080; + const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); - if (speed >= 1) { - if (VPXMIN(cm->width, cm->height) >= 720) { - sf->disable_split_mask = - cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 23); + // speed 0 features + sf->partition_search_breakout_thr.dist = (1 << 20); + sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; + + if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; + } else { + sf->use_square_only_thresh_high = BLOCK_32X32; + } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } + + if (!is_1080p_or_larger) { + sf->rd_ml_partition.search_breakout = 1; + if (is_720p_or_larger) { + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f; } else { - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - sf->partition_search_breakout_dist_thr = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f; + sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f; + sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f; } } + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + + if (speed >= 1) { + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 1; + if (is_480p_or_larger) + sf->use_square_only_thresh_high = BLOCK_64X64; + else + sf->use_square_only_thresh_high = BLOCK_32X32; + sf->use_square_only_thresh_low = BLOCK_16X16; + if (is_720p_or_larger) { + sf->disable_split_mask = + cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f; + } else { + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->partition_search_breakout_thr.dist = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) { + sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + if (speed >= 2) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_square_only_thresh_high = BLOCK_4X4; + sf->use_square_only_thresh_low = BLOCK_SIZES; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; - sf->partition_search_breakout_dist_thr = (1 << 24); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 24); + sf->partition_search_breakout_thr.rate = 120; + sf->rd_ml_partition.search_breakout = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - sf->partition_search_breakout_dist_thr = (1 << 22); - sf->partition_search_breakout_rate_thr = 100; + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->partition_search_breakout_thr.rate = 100; + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); // Use a set of speed features for 4k videos. - if (VPXMIN(cm->width, cm->height) >= 2160) { + if (is_2160p_or_larger) { sf->use_square_partition_only = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->alt_ref_search_fp = 1; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->adaptive_interp_filter_search = 1; sf->disable_split_mask = DISABLE_ALL_SPLIT; } } if (speed >= 3) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->rd_ml_partition.search_breakout = 0; + if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 25); - sf->partition_search_breakout_rate_thr = 200; + sf->partition_search_breakout_thr.dist = (1 << 25); + sf->partition_search_breakout_thr.rate = 200; } else { sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0; - sf->partition_search_breakout_dist_thr = (1 << 23); - sf->partition_search_breakout_rate_thr = 120; + sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.rate = 120; } } @@ -129,37 +199,87 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 4) { - if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 26); + sf->partition_search_breakout_thr.rate = 300; + if (is_720p_or_larger) { + sf->partition_search_breakout_thr.dist = (1 << 26); } else { - sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_thr.dist = (1 << 24); } sf->disable_split_mask = DISABLE_ALL_SPLIT; } + + if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 500; + } } static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 }; static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 }; -static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, - SPEED_FEATURES *sf, int speed) { +static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, + VP9_COMMON *cm, + SPEED_FEATURES *sf, + int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int boosted = frame_is_boosted(cpi); + int i; - sf->partition_search_breakout_dist_thr = (1 << 20); - sf->partition_search_breakout_rate_thr = 80; - sf->tx_size_search_breakout = 1; + sf->adaptive_interp_filter_search = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !frame_is_boosted(cpi); - sf->use_square_only_threshold = BLOCK_16X16; + sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; + sf->prune_ref_frame_for_rect_partitions = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; + + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; + + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + + sf->rd_ml_partition.var_pruning = 1; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = 350; + sf->rd_ml_partition.prune_rect_thresh[2] = 325; + sf->rd_ml_partition.prune_rect_thresh[3] = 250; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + sf->exhaustive_searches_thresh = (1 << 22); + } else { + sf->exhaustive_searches_thresh = INT_MAX; + } + + for (i = 0; i < MAX_MESH_STEP; ++i) { + const int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } if (speed >= 1) { - if (cpi->oxcf.pass == 2) { + sf->rd_ml_partition.var_pruning = !boosted; + sf->rd_ml_partition.prune_rect_thresh[1] = 225; + sf->rd_ml_partition.prune_rect_thresh[2] = 225; + sf->rd_ml_partition.prune_rect_thresh[3] = 225; + + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); + sf->use_square_partition_only = !boosted; } else { sf->use_square_partition_only = !frame_is_intra_only(cm); } @@ -169,49 +289,70 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; - - sf->use_square_only_threshold = BLOCK_4X4; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5]; sf->less_rectangular_check = 1; - sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->mv.subpel_iters_per_step = 1; - sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; + sf->mv.subpel_search_level = 1; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; sf->allow_acl = 0; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) { + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; + + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) + : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; } if (speed >= 2) { - sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->rd_ml_partition.var_pruning = 0; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; - sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; - sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + sf->enhanced_full_pixel_motion_search = 0; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->mv.subpel_search_level = 0; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 1; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } + + sf->use_accurate_subpel_search = USE_2_TAPS; } if (speed >= 3) { @@ -222,14 +363,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->alt_ref_search_fp = 1; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; - sf->adaptive_interp_filter_search = 1; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + for (i = 0; i < MAX_MESH_STEP; ++i) { + int mesh_density_level = 2; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + } } if (speed >= 4) { @@ -245,11 +395,9 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->use_fast_coef_updates = ONE_LOOP_REDUCED; sf->use_fast_coef_costing = 1; sf->motion_field_mode_search = !boosted; - sf->partition_search_breakout_rate_thr = 300; } if (speed >= 5) { - int i; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; @@ -257,11 +405,11 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_y_mode_mask[i] = INTRA_DC; sf->intra_uv_mode_mask[i] = INTRA_DC; } - sf->partition_search_breakout_rate_thr = 500; sf->mv.reduce_first_step_size = 1; sf->simple_model_rd_from_var = 1; } } +#endif // !CONFIG_REALTIME_ONLY static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, @@ -287,10 +435,11 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 5) { + sf->partition_search_breakout_thr.rate = 200; if (VPXMIN(cm->width, cm->height) >= 720) { - sf->partition_search_breakout_dist_thr = (1 << 25); + sf->partition_search_breakout_thr.dist = (1 << 25); } else { - sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 23); } } @@ -300,24 +449,44 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, } } -static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, - vp9e_tune_content content) { +static void set_rt_speed_feature_framesize_independent( + VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh_row_mt = 0; sf->use_fast_coef_costing = 1; - sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; sf->allow_acl = 0; sf->copy_partition_flag = 0; + sf->use_source_sad = 0; + sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; + sf->overshoot_detection_cbr_rt = NO_DETECTION; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; + sf->enable_tpl_model = 0; + sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; + sf->nonrd_use_ml_partition = 0; + sf->variance_part_thresh_mult = 1; + sf->cb_pred_filter_search = 0; + sf->force_smooth_interpol = 0; + sf->rt_intra_dc_only_low_content = 0; + sf->mv.enable_adaptive_subpel_force_stop = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = 0.0; - sf->allow_quant_coeff_opt = 0; - sf->quant_opt_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = @@ -336,25 +505,24 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 2) { sf->mode_search_skip_flags = - (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; + (cm->frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->adaptive_pred_interp_filter = 2; // Reference masking only enabled for 1 spatial layer, and if none of the // references have been scaled. The latter condition needs to be checked // for external or internal dynamic resize. - sf->reference_masking = (cpi->svc.number_spatial_layers == 1); + sf->reference_masking = (svc->number_spatial_layers == 1); if (sf->reference_masking == 1 && (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { const struct scale_factors *const scale_fac = &cm->frame_refs[ref_frame - 1].sf; if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0; @@ -363,7 +531,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, } sf->disable_filter_search_var_thresh = 50; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; @@ -378,7 +546,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->disable_filter_search_var_thresh = 100; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; - sf->mv.subpel_iters_per_step = 1; + sf->mv.subpel_search_level = 0; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; sf->allow_skip_recode = 0; @@ -389,14 +557,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 4) { int i; - sf->last_partitioning_redo_frequency = 4; - sf->adaptive_rd_thresh = 5; - sf->use_fast_coef_costing = 0; - sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX; - sf->adjust_partitioning_from_last_frame = - cm->last_frame_type != cm->frame_type || - (0 == (frames_since_key + 1) % sf->last_partitioning_redo_frequency); - sf->mv.subpel_force_stop = 1; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; + sf->mv.subpel_force_stop = QUARTER_PEL; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC; @@ -404,16 +567,23 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->frame_parameter_update = 0; sf->mv.search_method = FAST_HEX; - - sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; - sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; - sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; + sf->allow_skip_recode = 0; sf->max_intra_bsize = BLOCK_32X32; - sf->allow_skip_recode = 1; + sf->use_fast_coef_costing = 0; + sf->use_quant_fp = !is_keyframe; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO; + sf->adaptive_rd_thresh = 2; + sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; + sf->partition_search_type = VAR_BASED_PARTITION; } if (speed >= 5) { + sf->use_altref_onepass = 0; sf->use_quant_fp = !is_keyframe; sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX; @@ -437,7 +607,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->adaptive_rd_thresh = 2; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; - sf->partition_search_breakout_rate_thr = 200; sf->coeff_prob_appx_step = 4; sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH; @@ -449,7 +618,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, int i; if (content == VP9E_CONTENT_SCREEN) { for (i = 0; i < BLOCK_SIZES; ++i) - sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; + if (i >= BLOCK_32X32) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V; + else + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; } else { for (i = 0; i < BLOCK_SIZES; ++i) if (i > BLOCK_16X16) @@ -463,80 +635,260 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->short_circuit_flat_blocks = 1; } if (cpi->oxcf.rc_mode == VPX_CBR && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + cpi->oxcf.content != VP9E_CONTENT_SCREEN) { sf->limit_newmv_early_exit = 1; - sf->bias_golden = 1; + if (!cpi->use_svc) sf->bias_golden = 1; } + // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent + // increase in encoding time. + if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) { + if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) + sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ; + else + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; + if (!cpi->external_resize) sf->use_source_sad = 1; } if (speed >= 6) { + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } sf->partition_search_type = VAR_BASED_PARTITION; - // Turn on this to use non-RD key frame coding mode. - sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; - if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && - content != VP9E_CONTENT_SCREEN) { + + if (sf->use_source_sad) { + sf->adapt_partition_source_sad = 1; + sf->adapt_partition_thresh = + (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; + if (cpi->content_state_sb_fd == NULL && + (!cpi->use_svc || + svc->spatial_layer_id == svc->number_spatial_layers - 1)) { + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, + (uint8_t *)vpx_calloc( + (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(uint8_t))); + } + } + if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // Enable short circuit for low temporal variance. sf->short_circuit_low_temp_var = 1; } - if (cpi->use_svc) sf->base_mv_aggressive = 1; + if (svc->temporal_layer_id > 0) { + sf->adaptive_rd_thresh = 4; + sf->limit_newmv_early_exit = 0; + sf->base_mv_aggressive = 1; + } + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; } if (speed >= 7) { + sf->adapt_partition_source_sad = 0; sf->adaptive_rd_thresh = 3; sf->mv.search_method = FAST_DIAMOND; sf->mv.fullpel_search_step_param = 10; - if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0) { + // For SVC: use better mv search on base temporal layer, and only + // on base spatial layer if highest resolution is above 640x360. + if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 && + (svc->spatial_layer_id == 0 || + cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } + if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) { + sf->use_simple_block_yrd = 1; + if (svc->non_reference_frame) + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; + } + if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable partition copy. For SVC only enabled for top spatial resolution + // layer. + cpi->max_copied_frame = 0; + if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && + !cpi->external_resize && + (!cpi->use_svc || + (svc->spatial_layer_id == svc->number_spatial_layers - 1 && + !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) { + sf->copy_partition_flag = 1; + cpi->max_copied_frame = 2; + // The top temporal enhancement layer (for number of temporal layers > 1) + // are non-reference frames, so use large/max value for max_copied_frame. + if (svc->number_temporal_layers > 1 && + svc->temporal_layer_id == svc->number_temporal_layers - 1) + cpi->max_copied_frame = 255; + } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && svc->use_partition_reuse && + svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; + // For SVC when golden is used as second temporal reference: to avoid + // encode time increase only use this feature on base temporal layer. + // (i.e remove golden flag from frame_flags for temporal_layer_id > 0). + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id > 0) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; } if (speed >= 8) { sf->adaptive_rd_thresh = 4; - // Disabled for now until the threshold is tuned. - sf->copy_partition_flag = 0; - if (sf->copy_partition_flag) { - if (cpi->prev_partition == NULL) { - cpi->prev_partition = (BLOCK_SIZE *)vpx_calloc( - cm->mi_stride * cm->mi_rows, sizeof(BLOCK_SIZE)); - } - if (cpi->prev_segment_id == NULL) { - cpi->prev_segment_id = - (int8_t *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(int8_t)); - } + sf->skip_encode_sb = 1; + if (cpi->svc.number_spatial_layers > 1 && !cpi->svc.simulcast_mode) + sf->nonrd_keyframe = 0; + else + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; + if (cpi->row_mt && cpi->oxcf.max_threads > 1) + sf->adaptive_rd_thresh_row_mt = 1; + // Enable ML based partition for low res. + if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) { + sf->nonrd_use_ml_partition = 1; } - sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; - if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; - // Only keep INTRA_DC mode for speed 8. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->nonrd_use_ml_partition = 0; +#endif + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL; + sf->rt_intra_dc_only_low_content = 1; + if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && + content != VP9E_CONTENT_SCREEN) { + // More aggressive short circuit for speed 8. + sf->short_circuit_low_temp_var = 3; + // Use level 2 for noisey cases as there is a regression in some + // noisy clips with level 3. + if (cpi->noise_estimate.enabled && cm->width >= 1280 && + cm->height >= 720) { + NOISE_LEVEL noise_level = + vp9_noise_estimate_extract_level(&cpi->noise_estimate); + if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2; + } + // Since the short_circuit_low_temp_var is used, reduce the + // adaptive_rd_thresh level. + if (cm->width * cm->height > 352 * 288) + sf->adaptive_rd_thresh = 1; + else + sf->adaptive_rd_thresh = 2; + } + sf->limit_newmv_early_exit = 0; + sf->use_simple_block_yrd = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2; + } + + if (speed >= 9) { + // Only keep INTRA_DC mode for speed 9. if (!is_keyframe) { int i = 0; for (i = 0; i < BLOCK_SIZES; ++i) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; } - if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && - content != VP9E_CONTENT_SCREEN) { - // More aggressive short circuit for speed 8. - sf->short_circuit_low_temp_var = 3; - } - sf->limit_newmv_early_exit = 0; + sf->cb_pred_filter_search = 2; + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; + sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR; + if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2; } + + // Disable split to 8x8 for low-resolution at very high Q. + // For variance partition (speed >= 6). Ignore the first few frames + // as avg_frame_qindex starts at max_q (worst_quality). + if (cm->frame_type != KEY_FRAME && cm->width * cm->height <= 320 * 240 && + sf->partition_search_type == VAR_BASED_PARTITION && + cpi->rc.avg_frame_qindex[INTER_FRAME] > 208 && + cpi->common.current_video_frame > 8) + sf->disable_16x16part_nonkey = 1; + + if (sf->nonrd_use_ml_partition) + sf->partition_search_type = ML_BASED_PARTITION; + + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) { + CHECK_MEM_ERROR( + &cm->error, cpi->count_arf_frame_usage, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage))); + } + if (cpi->count_lastgolden_frame_usage == NULL) + CHECK_MEM_ERROR( + &cm->error, cpi->count_lastgolden_frame_usage, + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage))); + } + if (svc->previous_frame_is_intra_only) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + // Special case for screen content: increase motion search on base spatial + // layer when high motion is detected or previous SL0 frame was dropped. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 && + (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) { + sf->mv.search_method = NSTEP; + // TODO(marpan/jianj): Tune this setting for screensharing. For now use + // small step_param for all spatial layers. + sf->mv.fullpel_search_step_param = 2; + } + // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it + // off for now. + if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; + + // TODO(marpan): Force this feature off always, for the issue: 366146260 + // Remove this disabling when underlying issue is resolved. + sf->svc_use_lowres_part = 0; } -void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RD_OPT *const rd = &cpi->rd; int i; - if (oxcf->mode == REALTIME) { - set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } else if (oxcf->mode == GOOD) { - set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } + // best quality defaults + // Some speed-up features even for best quality as minimal impact on quality. + sf->partition_search_breakout_thr.dist = (1 << 19); + sf->partition_search_breakout_thr.rate = 80; + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 0; + + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_dependent(cpi, sf, speed); +#endif if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { sf->adaptive_pred_interp_filter = 0; @@ -553,11 +905,22 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { rd->thresh_mult_sub8x8[i] = INT_MAX; } } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; } -void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; +#if !CONFIG_REALTIME_ONLY VP9_COMMON *const cm = &cpi->common; +#endif MACROBLOCK *const x = &cpi->td.mb; const VP9EncoderConfig *const oxcf = &cpi->oxcf; int i; @@ -567,20 +930,24 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->mv.search_method = NSTEP; sf->recode_loop = ALLOW_RECODE_FIRST; sf->mv.subpel_search_method = SUBPEL_TREE; - sf->mv.subpel_iters_per_step = 2; - sf->mv.subpel_force_stop = 0; + sf->mv.subpel_search_level = 2; + sf->mv.subpel_force_stop = EIGHTH_PEL; sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; - sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->mv.use_downsampled_sad = 0; + sf->comp_inter_joint_search_iter_level = 0; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; + sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; @@ -589,7 +956,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; - sf->use_square_only_threshold = BLOCK_SIZES; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_64X64; @@ -602,12 +970,16 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->max_delta_qindex = 0; sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; - sf->allow_partition_search_skip = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; sf->allow_acl = 1; + sf->enable_tpl_model = oxcf->enable_tpl_model; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->temporal_filter_search_method = MESH; + sf->allow_skip_txfm_ac_dc = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -629,7 +1001,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // This setting only takes effect when partition_search_type is set // to FIXED_PARTITION. sf->always_this_block_size = BLOCK_16X16; - sf->search_type_check_frequency = 50; sf->encode_breakout_thresh = 0; // Recode loop tolerance %. sf->recode_tolerance_low = 12; @@ -641,49 +1012,41 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->limit_newmv_early_exit = 0; sf->bias_golden = 0; sf->base_mv_aggressive = 0; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->rd_ml_partition.var_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; - sf->partition_search_breakout_dist_thr = (1 << 19); - sf->partition_search_breakout_rate_thr = 80; + sf->tx_size_search_depth = 2; - if (oxcf->mode == REALTIME) - set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); - else if (oxcf->mode == GOOD) - set_good_speed_feature(cpi, cm, sf, oxcf->speed); - - cpi->full_search_sad = vp9_full_search_sad; - cpi->diamond_search_sad = vp9_diamond_search_sad; - - sf->allow_exhaustive_searches = 1; - if (oxcf->mode == BEST) { - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 20); - else - sf->exhaustive_searches_thresh = (1 << 21); - sf->max_exaustive_pct = 100; + sf->exhaustive_searches_thresh = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) + : INT_MAX; + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; - } - } else { - int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) - sf->exhaustive_searches_thresh = (1 << 22); - else - sf->exhaustive_searches_thresh = (1 << 23); - sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; - if (speed > 0) - sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; - - for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[speed][i].interval; + best_quality_mesh_pattern[mesh_density_level][i].interval; } } + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_independent(cpi, cm, sf, speed); +#endif + + cpi->diamond_search_sad = vp9_diamond_search_sad; + // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. if (oxcf->pass == 1) sf->optimize_coefficients = 0; @@ -694,7 +1057,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - if (sf->mv.subpel_force_stop == 3) { + if (sf->mv.subpel_force_stop == FULL_PEL) { // Whole pel only cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree; } else if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -707,6 +1070,12 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore; } + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; x->min_partition_size = sf->default_min_partition_size; @@ -715,4 +1084,13 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match. + // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since + // adaptive_rd_thresh is defined per-row for non-rd pickmode. + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) + sf->adaptive_rd_thresh = 0; } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h index 478684d059..92a7df767f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_ -#define VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ #include "vp9/common/vp9_enums.h" @@ -57,7 +57,8 @@ typedef enum { BIGDIA = 3, SQUARE = 4, FAST_HEX = 5, - FAST_DIAMOND = 6 + FAST_DIAMOND = 6, + MESH = 7 } SEARCH_METHODS; typedef enum { @@ -135,20 +136,20 @@ typedef enum { } INTERP_FILTER_MASK; typedef enum { - // Search partitions using RD/NONRD criterion + // Search partitions using RD/NONRD criterion. SEARCH_PARTITION, - // Always use a fixed size partition + // Always use a fixed size partition. FIXED_PARTITION, REFERENCE_PARTITION, // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB + // a 64X64 SB. VAR_BASED_PARTITION, - // Use non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + // Make partition decisions with machine learning models. + ML_BASED_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -161,6 +162,19 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP; + +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + SUBPEL_FORCE_STOP force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + SUBPEL_FORCE_STOP force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -179,20 +193,31 @@ typedef struct MV_SPEED_FEATURES { // the same process. Along the way it skips many diagonals. SUBPEL_SEARCH_METHODS subpel_search_method; - // Maximum number of steps in logarithmic subpel search before giving up. - int subpel_iters_per_step; + // Subpel MV search level. Can take values 0 - 2. Higher values mean more + // extensive subpel search. + int subpel_search_level; - // Control when to stop subpel search: - // 0: Full subpel search. - // 1: Stop at quarter pixel. - // 2: Stop at half pixel. - // 3: Stop at full pixel. - int subpel_force_stop; + // When to stop subpel motion search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; + + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. + int use_downsampled_sad; } MV_SPEED_FEATURES; +typedef struct PARTITION_SEARCH_BREAKOUT_THR { + int64_t dist; + int rate; +} PARTITION_SEARCH_BREAKOUT_THR; + #define MAX_MESH_STEP 4 typedef struct MESH_PATTERN { @@ -200,6 +225,46 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; +typedef enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q (max_q set by user) based only on the + // detected slide/scene change and current/past Q. + FAST_DETECTION_MAXQ = 1, + + // Based on (first pass) encoded frame, if large frame size is detected + // then set to higher Q for the second re-encode. This involves 2 pass + // encoding on slide change, so slower than 1, but more accurate for + // detecting overshoot. + RE_ENCODE_MAXQ = 2 +} OVERSHOOT_DETECTION_CBR_RT; + +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, + USE_8_TAPS_SHARP, +} SUBPEL_SEARCH_TYPE; + +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -218,16 +283,30 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; - // If 1 we iterate finding a best reference for 2 ref frames together - via - // a log search that iterates 4 times (check around mv for last for best - // error of combined predictor then check around mv for alt). If 0 we - // we just use the best motion vector found for each frame by itself. - BLOCK_SIZE comp_inter_joint_search_thresh; + // The best compound predictor is found using an iterative log search process + // that searches for best ref0 mv using error of combined predictor and then + // searches for best ref1 mv. This sf determines the number of iterations of + // this process based on block size. The sf becomes more aggressive from level + // 0 to 2. The following table indicates the number of iterations w.r.t bsize: + // ----------------------------------------------- + // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 | + // | 0 | 4 | 4 | 4 | + // | 1 | 0 | 2 | 4 | + // | 2 | 0 | 0 | 0 | + // ----------------------------------------------- + // Here, 0 iterations indicate using the best single motion vector selected + // for each ref frame without any iterative refinement. + int comp_inter_joint_search_iter_level; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. + // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0). int adaptive_rd_thresh; + // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd + // pickmode. + int adaptive_rd_thresh_row_mt; + // Enables skipping the reconstruction step (idct, recon) in the // intermediate steps assuming the last frame didn't have too many intra // blocks and the q is less than a threshold. @@ -241,13 +320,16 @@ typedef struct SPEED_FEATURES { int coeff_prob_appx_step; // Enable uniform quantizer followed by trellis coefficient optimization - int allow_quant_coeff_opt; - double quant_opt_thresh; + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; // Enable asymptotic closed-loop encoding decision for key frame and // alternate reference frames. int allow_acl; + // Temporal dependency model based encoding mode optimization + int enable_tpl_model; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -262,6 +344,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. int use_lp32x32fdct; @@ -276,16 +361,21 @@ typedef struct SPEED_FEATURES { PARTITION_SEARCH_TYPE partition_search_type; - // Used if partition_search_type = FIXED_SIZE_PARTITION + // Used if partition_search_type = FIXED_PARTITION BLOCK_SIZE always_this_block_size; // Skip rectangular partition test when partition type none gives better // rd than partition type split. int less_rectangular_check; - // Disable testing non square partitions. (eg 16x32) + // Disable testing non square partitions(eg 16x32) for block sizes larger than + // use_square_only_thresh_high or smaller than use_square_only_thresh_low. int use_square_partition_only; - BLOCK_SIZE use_square_only_threshold; + BLOCK_SIZE use_square_only_thresh_high; + BLOCK_SIZE use_square_only_thresh_low; + + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. @@ -317,15 +407,12 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; - // Flag for allowing some use of exhaustive searches; - int allow_exhaustive_searches; + // Do extra full pixel motion search to obtain better motion vector. + int enhanced_full_pixel_motion_search; // Threshold for allowing exhaistive motion search. int exhaustive_searches_thresh; - // Maximum number of exhaustive searches for a frame. - int max_exaustive_pct; - // Pattern to be used for any exhaustive mesh searches. MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; @@ -340,9 +427,21 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; @@ -415,10 +514,6 @@ typedef struct SPEED_FEATURES { // TODO(aconverse): Fold this into one of the other many mode skips BLOCK_SIZE max_intra_bsize; - // The frequency that we check if SOURCE_VAR_BASED_PARTITION or - // FIXED_PARTITION search type should be used. - int search_type_check_frequency; - // When partition is pre-set, the inter prediction result from pick_inter_mode // can be reused in final block encoding process. It is enabled only for real- // time mode speed 6. @@ -442,11 +537,29 @@ typedef struct SPEED_FEATURES { INTERP_FILTER_MASK interp_filter_search_mask; // Partition search early breakout thresholds. - int64_t partition_search_breakout_dist_thr; - int partition_search_breakout_rate_thr; + PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; - // Allow skipping partition search for still image frame - int allow_partition_search_skip; + struct { + // Use ML-based partition search early breakout. + int search_breakout; + // Higher values mean more aggressiveness for partition search breakout that + // results in better encoding speed but worse compression performance. + float search_breakout_thresh[3]; + + // Machine-learning based partition search early termination + int search_early_termination; + + // Machine-learning based partition search pruning using prediction residue + // variance. + int var_pruning; + + // Threshold values used for ML based rectangular partition search pruning. + // If < 0, the feature is turned off. + // Higher values mean more aggressiveness to skip rectangular partition + // search that results in better encoding speed but worse coding + // performance. + int prune_rect_thresh[4]; + } rd_ml_partition; // Fast approximation of vp9_model_rd_from_var_lapndz int simple_model_rd_from_var; @@ -456,12 +569,13 @@ typedef struct SPEED_FEATURES { int short_circuit_flat_blocks; // Skip a number of expensive mode evaluations for blocks with very low - // temporal variance. + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and // 32x16. - // 3: Same as (2), but also skip golden zeromv for low res. + // 3: Same as (2), but also skip golden zeromv. int short_circuit_low_temp_var; // Limits the rd-threshold update for early exit for the newmv-last mode, @@ -477,15 +591,77 @@ typedef struct SPEED_FEATURES { // Global flag to enable partition copy from the previous frame. int copy_partition_flag; + + // Compute the source sad for every superblock of the frame, + // prior to encoding the frame, to be used to bypass some encoder decisions. + int use_source_sad; + + int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; + + // Flag to indicate process for handling overshoot on slide/scene change, + // for real-time CBR mode. + OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. + int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve accurate motion search result. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Search method used by temporal filtering in full_pixel_motion_search. + SEARCH_METHODS temporal_filter_search_method; + + // Use machine learning based partition search. + int nonrd_use_ml_partition; + + // Multiplier for base threshold for variance partitioning. + int variance_part_thresh_mult; + + // Force subpel motion filter to always use SMOOTH_FILTER. + int force_smooth_interpol; + + // For real-time mode: force DC only under intra search when content + // does not have high souce SAD. + int rt_intra_dc_only_low_content; + + // The encoder has a feature that skips forward transform and quantization + // based on a model rd estimation to reduce encoding time. + // However, this feature is dangerous since it could lead to bad perceptual + // quality. This flag is added to guard the feature. + int allow_skip_txfm_ac_dc; } SPEED_FEATURES; struct VP9_COMP; -void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi); -void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi); +void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi, + int speed); +void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi, + int speed); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c index e8212ce05e..2e1810b0ee 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c @@ -66,11 +66,13 @@ static int remap_prob(int v, int m) { }; v--; m--; + assert(m >= 0); if ((m << 1) <= MAX_PROB) i = recenter_nonneg(v, m) - 1; else i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + assert(i >= 0 && (size_t)i < sizeof(map_table)); i = map_table[i]; return i; } @@ -113,19 +115,20 @@ void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) { encode_term_subexp(w, delp); } -int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd) { - const int old_b = cost_branch256(ct, oldp); - int bestsavings = 0; +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd) { + const int64_t old_b = cost_branch256(ct, oldp); + int64_t bestsavings = 0; vpx_prob newp, bestnewp = oldp; const int step = *bestp > oldp ? -1 : 1; const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) { for (newp = *bestp; newp != oldp; newp += step) { - const int new_b = cost_branch256(ct, newp); - const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost; - const int savings = old_b - new_b - update_b; + const int64_t new_b = cost_branch256(ct, newp); + const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + const int64_t savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; bestnewp = newp; @@ -136,15 +139,15 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, return bestsavings; } -int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, - const vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd, - int stepsize) { - int i, old_b, new_b, update_b, savings, bestsavings; - int newp; - const int step_sign = *bestp > oldp ? -1 : 1; - const int step = stepsize * step_sign; - const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize) { + int64_t i, old_b, new_b, update_b, savings, bestsavings; + int64_t newp; + const int64_t step_sign = *bestp > oldp ? -1 : 1; + const int64_t step = stepsize * step_sign; + const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd); const vpx_prob *newplist, *oldplist; vpx_prob bestnewp; oldplist = vp9_pareto8_full[oldp - 1]; @@ -161,14 +164,14 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) { if (newp < 1 || newp > 255) continue; newplist = vp9_pareto8_full[newp - 1]; - new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp); + new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp); for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]); - update_b = prob_diff_update_cost(newp, oldp) + upd_cost; + update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost; savings = old_b - new_b - update_b; if (savings > bestsavings) { bestsavings = savings; - bestnewp = newp; + bestnewp = (vpx_prob)newp; } } } @@ -181,7 +184,7 @@ void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp, const unsigned int ct[2]) { const vpx_prob upd = DIFF_UPDATE_PROB; vpx_prob newp = get_binary_prob(ct[0], ct[1]); - const int savings = + const int64_t savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd); assert(newp >= 1); if (savings > 0) { diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h index 26c89e2ea7..2d016d24c5 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SUBEXP_H_ -#define VP9_ENCODER_VP9_SUBEXP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_ +#define VPX_VP9_ENCODER_VP9_SUBEXP_H_ #ifdef __cplusplus extern "C" { @@ -25,16 +25,17 @@ void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp, void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp, const unsigned int ct[2]); -int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd); +int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct, + vpx_prob oldp, vpx_prob *bestp, + vpx_prob upd); -int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, - const vpx_prob oldp, - vpx_prob *bestp, vpx_prob upd, - int stepsize); +int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vpx_prob oldp, + vpx_prob *bestp, vpx_prob upd, + int stepsize); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SUBEXP_H_ +#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c index 1d892dc148..6e9405e422 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -19,6 +19,14 @@ #define SMALL_FRAME_WIDTH 32 #define SMALL_FRAME_HEIGHT 16 +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -29,20 +37,53 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - svc->first_spatial_layer_to_encode = 0; - svc->rc_drop_superframe = 0; svc->force_zero_mode_spatial_ref = 0; svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + svc->use_gf_temporal_ref = 1; + svc->use_gf_temporal_ref_current_layer = 0; svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; - for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = 0; - cpi->svc.ext_lst_fb_idx[sl] = 0; - cpi->svc.ext_gld_fb_idx[sl] = 1; - cpi->svc.ext_alt_fb_idx[sl] = 2; + svc->non_reference_frame = 0; + svc->skip_enhancement_layer = 0; + svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; + svc->framedrop_mode = CONSTRAINED_LAYER_DROP; + svc->set_intra_only_frame = 0; + svc->previous_frame_is_intra_only = 0; + svc->superframe_has_layer_sync = 0; + svc->use_set_ref_frame_config = 0; + svc->num_encoded_top_layer = 0; + svc->simulcast_mode = 0; + svc->single_layer_svc = 0; + svc->resize_set = 0; + + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = 0xff; + svc->fb_idx_temporal_layer_id[i] = 0xff; + svc->fb_idx_base[i] = 0; } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + svc->last_layer_dropped[sl] = 0; + svc->drop_spatial_layer[sl] = 0; + svc->ext_frame_flags[sl] = 0; + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. + svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; + svc->spatial_layer_sync[sl] = 0; + svc->force_drop_constrained_from_above[sl] = 0; + } + svc->max_consec_drop = INT_MAX; + + svc->buffer_gf_temporal_ref[1].idx = 7; + svc->buffer_gf_temporal_ref[0].idx = 6; + svc->buffer_gf_temporal_ref[1].is_used = 0; + svc->buffer_gf_temporal_ref[0].is_used = 0; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -66,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; lc->frames_from_key_frame = 0; @@ -80,6 +120,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; + lrc->worst_quality = oxcf->worst_allowed_q; + lrc->best_quality = oxcf->best_allowed_q; for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; @@ -118,17 +160,20 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { size_t consec_zero_mv_size; VP9_COMMON *const cm = &cpi->common; lc->sb_index = 0; - CHECK_MEM_ERROR(cm, lc->map, + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; + CHECK_MEM_ERROR(&cm->error, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); last_coded_q_map_size = mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); - CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); - CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } @@ -149,6 +194,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, const RATE_CONTROL *const rc = &cpi->rc; int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + int num_spatial_layers_nonzero_rate = 0; + + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { @@ -171,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = saturate_cast_double_to_int( + round(lc->target_bandwidth / lc->framerate)); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; @@ -203,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } // Update buffer-related quantities. lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); @@ -220,17 +273,29 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } else { lc->framerate = cpi->framerate; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = saturate_cast_double_to_int( + round(lc->target_bandwidth / lc->framerate)); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; } } + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + // Check bitrate of spatia layer. + layer = LAYER_IDS_TO_IDX(sl, oxcf->ts_number_layers - 1, + oxcf->ts_number_layers); + if (oxcf->layer_target_bitrate[layer] > 0) + num_spatial_layers_nonzero_rate += 1; + } + if (num_spatial_layers_nonzero_rate == 1) + svc->single_layer_svc = 1; + else + svc->single_layer_svc = 0; } static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { - if (is_one_pass_cbr_svc(cpi)) + if (is_one_pass_svc(cpi)) return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id]; @@ -251,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + saturate_cast_double_to_int(round(lc->target_bandwidth / lc->framerate)); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). if (tl == 0) { @@ -262,8 +328,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int prev_layer_target_bandwidth = oxcf->layer_target_bitrate[st_idx - 1]; lc->avg_frame_size = - (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / - (lc->framerate - prev_layer_framerate)); + (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); } } @@ -273,12 +339,14 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); - lrc->min_frame_bandwidth = - (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); - lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * - oxcf->two_pass_vbrmax_section) / - 100); + lrc->avg_frame_bandwidth = + saturate_cast_double_to_int(round(lc->target_bandwidth / lc->framerate)); + const int64_t vbr_min_bits = + (int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100; + lrc->min_frame_bandwidth = (int)VPXMIN(vbr_min_bits, INT_MAX); + const int64_t vbr_max_bits = + (int64_t)lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section / 100; + lrc->max_frame_bandwidth = (int)VPXMIN(vbr_max_bits, INT_MAX); vp9_rc_set_gf_interval_range(cpi, lrc); } @@ -286,6 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; + const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop; cpi->rc = lc->rc; cpi->twopass = lc->twopass; @@ -293,32 +362,30 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { cpi->alt_ref_source = lc->alt_ref_source; // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode // does not use speed = 0). - if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) { + if (is_one_pass_svc(cpi) && lc->speed > 0) { cpi->oxcf.speed = lc->speed; } + cpi->loopfilter_ctrl = lc->loopfilter_ctrl; // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { + cpi->svc.number_spatial_layers > 1) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } - + cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - signed char *temp = cr->map; - uint8_t *temp2 = cr->last_coded_q_map; - uint8_t *temp3 = cpi->consec_zero_mv; - cr->map = lc->map; - lc->map = temp; - cr->last_coded_q_map = lc->last_coded_q_map; - lc->last_coded_q_map = temp2; - cpi->consec_zero_mv = lc->consec_zero_mv; - lc->consec_zero_mv = temp3; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv); cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; } } @@ -330,6 +397,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. @@ -346,9 +415,16 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->consec_zero_mv = cpi->consec_zero_mv; cpi->consec_zero_mv = temp3; lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; } } +#if !CONFIG_REALTIME_ONLY void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; @@ -364,6 +440,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { } svc->spatial_layer_id = 0; } +#endif // !CONFIG_REALTIME_ONLY void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = @@ -375,21 +452,18 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { ++cpi->svc.current_superframe; } -int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame; -} - -static void get_layer_resolution(const int width_org, const int height_org, - const int num, const int den, int *width_out, - int *height_out) { +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { int w, h; - if (width_out == NULL || height_out == NULL || den == 0) return; + if (width_out == NULL || height_out == NULL) return; + + if (den == 0 || num == 0) { + *width_out = width_org; + *height_out = height_org; + return; + } w = width_org * num / den; h = height_org * num / den; @@ -402,6 +476,48 @@ static void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +static void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & VP9_GOLD_FLAG) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & VP9_ALT_FLAG) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + +// Never refresh any reference frame buffers on top temporal layers in +// simulcast mode, which has interlayer prediction disabled. +static void non_reference_frame_simulcast(VP9_COMP *const cpi) { + if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 && + cpi->svc.temporal_layer_id > 0) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. @@ -505,6 +621,10 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -540,6 +660,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } @@ -562,6 +684,10 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -594,197 +720,286 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } -int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { - int width = 0, height = 0; - LAYER_CONTEXT *lc = NULL; - if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; - cpi->svc.force_zero_mode_spatial_ref = 1; +static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( + VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; + cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG; + if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG; + if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG; +} - if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { - set_flags_and_fb_idx_for_temporal_mode3(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { - set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_0101) { - set_flags_and_fb_idx_for_temporal_mode2(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - // In the BYPASS/flexible mode, the encoder is relying on the application - // to specify, for each spatial layer, the flags and buffer indices for the - // layering. - // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is - // needed to support the case where the frame flags may be passed in via - // vpx_codec_encode(), which can be used for the temporal-only svc case. - // TODO(marpan): Consider adding an enc_config parameter to better handle - // this case. - if (cpi->ext_refresh_frame_flags_pending == 0) { - int sl; - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - sl = cpi->svc.spatial_layer_id; - vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the + // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + int ref; + for (ref = 0; ref < REF_FRAMES; ++ref) { + svc->update_buffer_slot[sl] &= ~(1 << ref); + if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) || + (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) || + (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame)) + svc->update_buffer_slot[sl] |= (1 << ref); } } - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) - cpi->svc.rc_drop_superframe = 0; + // TODO(jianj): Remove these 3, deprecated. + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG); + svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG); + svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG); +} + +int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { + int width = 0, height = 0; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + int scaling_factor_num = 1; + int scaling_factor_den = 1; + svc->skip_enhancement_layer = 0; + + if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF && + svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers <= 3) + svc->simulcast_mode = 1; + else + svc->simulcast_mode = 0; + + if (svc->number_spatial_layers > 1) { + svc->use_base_mv = 1; + svc->use_partition_reuse = 1; + } + svc->force_zero_mode_spatial_ref = 1; + + // For constrained_from_above drop mode: before encoding superframe (i.e., + // at SL0 frame) check all spatial layers (starting from top) for possible + // drop, and if so, set a flag to force drop of that layer and all its lower + // layers. + if (svc->spatial_layer_to_encode == svc->first_spatial_layer_to_encode) { + int sl; + for (sl = 0; sl < svc->number_spatial_layers; sl++) + svc->force_drop_constrained_from_above[sl] = 0; + if (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP) { + for (sl = svc->number_spatial_layers - 1; + sl >= svc->first_spatial_layer_to_encode; sl--) { + int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; + if (vp9_test_drop(cpi)) { + int sl2; + // Set flag to force drop in encoding for this mode. + for (sl2 = sl; sl2 >= svc->first_spatial_layer_to_encode; sl2--) + svc->force_drop_constrained_from_above[sl2] = 1; + break; + } + } + } + } + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + set_flags_and_fb_idx_for_temporal_mode3(cpi); + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { + set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { + set_flags_and_fb_idx_for_temporal_mode2(cpi); + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi); + } + + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx) + svc->buffer_gf_temporal_ref[0].is_used = 1; + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx) + svc->buffer_gf_temporal_ref[1].is_used = 1; + + // For the fixed (non-flexible/bypass) SVC mode: + // If long term temporal reference is enabled at the sequence level + // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames), + // we can use golden as a second temporal reference + // (since the spatial/inter-layer reference is disabled). + // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is + // unused (slot 7 and 6 should be available for 3-3 layer system). + // For now usage of this second temporal reference will only be used for + // highest and next to highest spatial layer (i.e., top and middle layer for + // 3 spatial layers). + svc->use_gf_temporal_ref_current_layer = 0; + if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used && + !svc->buffer_gf_temporal_ref[1].is_used && + svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON && + svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 && + svc->spatial_layer_id >= svc->number_spatial_layers - 2) { + // Enable the second (long-term) temporal reference at the frame-level. + svc->use_gf_temporal_ref_current_layer = 1; + } + + // Check if current superframe has any layer sync, only check once on + // base layer. + if (svc->spatial_layer_id == 0) { + int sl = 0; + // Default is no sync. + svc->superframe_has_layer_sync = 0; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1; + } + } + + // Reset the drop flags for all spatial layers, on the + // first_spatial_layer_to_encode. + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + vp9_zero(svc->drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + // These are set by API before the superframe is encoded and they are + // passed to encoder layer by layer. Don't reset them on layer 0 in bypass + // mode. + vp9_zero(svc->update_buffer_slot); + vp9_zero(svc->reference_last); + vp9_zero(svc->reference_golden); + vp9_zero(svc->reference_altref); + // TODO(jianj): Remove these 3, deprecated. + vp9_zero(svc->update_last); + vp9_zero(svc->update_golden); + vp9_zero(svc->update_altref); + } + } + + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS, // only for non-BYPASS mode for now. - if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->use_set_ref_frame_config) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); + if (cpi->fixed_qp_onepass) { + lrc->worst_quality = cpi->rc.worst_quality; + lrc->best_quality = cpi->rc.best_quality; + } } - get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, - lc->scaling_factor_num, lc->scaling_factor_den, &width, - &height); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && + svc->spatial_layer_id == svc->first_spatial_layer_to_encode && + cpi->resize_state != ORIG) { + scaling_factor_num = lc->scaling_factor_num_resize; + scaling_factor_den = lc->scaling_factor_den_resize; + } else { + scaling_factor_num = lc->scaling_factor_num; + scaling_factor_den = lc->scaling_factor_den; + } - // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use - // of base motion vectors if spatial scale factors for any layers are not 2. + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, scaling_factor_num, + scaling_factor_den, &width, &height); + + // Use Eightap_smooth for low resolutions. + if (width * height <= 320 * 240) + svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; + // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel + // to source pixel). + if (scaling_factor_num > (3 * scaling_factor_den) >> 2) + svc->downsample_filter_phase[svc->spatial_layer_id] = 0; + + // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2. + // For now, turn off use of base motion vectors and partition reuse if the + // spatial scale factors for any layers are not 2, + // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. - if (cpi->svc.number_spatial_layers > 1) { + if (svc->number_spatial_layers > 1) { int sl; - for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { - lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; - if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) { - cpi->svc.use_base_mv = 0; + for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) { + lc = &svc->layer_context[sl * svc->number_temporal_layers + + svc->temporal_layer_id]; + if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && + !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && + svc->number_spatial_layers == 3)) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; break; } } + // For non-zero spatial layers: if the previous spatial layer was dropped + // disable the base_mv and partition_reuse features. + if (svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + } + } + + svc->non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) + svc->non_reference_frame = 1; + // For flexible mode, where update_buffer_slot is used, need to check if + // all buffer slots are not refreshed. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) + svc->non_reference_frame = 0; + } + + if (svc->spatial_layer_id == 0) { + svc->high_source_sad_superframe = 0; + svc->high_num_blocks_with_motion = 0; + } + + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->last_layer_dropped[svc->spatial_layer_id] && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). + cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id]; } if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; + svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride; + svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows; + svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols; return 0; } -#if CONFIG_SPATIAL_SVC -#define SMALL_FRAME_FB_IDX 7 - -int vp9_svc_start_frame(VP9_COMP *const cpi) { - int width = 0, height = 0; - LAYER_CONTEXT *lc; - struct lookahead_entry *buf; - int count = 1 << (cpi->svc.number_temporal_layers - 1); - - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - - cpi->svc.temporal_layer_id = 0; - while ((lc->current_video_frame_in_layer % count) != 0) { - ++cpi->svc.temporal_layer_id; - count >>= 1; - } - - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - - if (cpi->svc.spatial_layer_id == 0) - cpi->gld_fb_idx = - (lc->gold_ref_idx >= 0) ? lc->gold_ref_idx : cpi->lst_fb_idx; - else - cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) { - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - } else { - cpi->alt_fb_idx = cpi->lst_fb_idx; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); - } - } else { - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) { - cpi->alt_fb_idx = lc->alt_ref_idx; - if (!lc->has_alt_frame) cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else { - // Find a proper alt_fb_idx for layers that don't have alt ref frame - if (cpi->svc.spatial_layer_id == 0) { - cpi->alt_fb_idx = cpi->lst_fb_idx; - } else { - LAYER_CONTEXT *lc_lower = - &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; - - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] && - lc_lower->alt_ref_source != NULL) - cpi->alt_fb_idx = lc_lower->alt_ref_idx; - else if (cpi->svc.spatial_layer_id >= 2) - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - else - cpi->alt_fb_idx = cpi->lst_fb_idx; - } - } - } - - get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, - lc->scaling_factor_num, lc->scaling_factor_den, &width, - &height); - - // Workaround for multiple frame contexts. In some frames we can't use prev_mi - // since its previous frame could be changed during decoding time. The idea is - // we put a empty invisible frame in front of them, then we will not use - // prev_mi when encoding these frames. - - buf = vp9_lookahead_peek(cpi->lookahead, 0); - if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && - lc->rc.frames_to_key != 0 && - !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { - if ((cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || - (cpi->svc.number_spatial_layers > 1 && - cpi->svc.spatial_layer_id == 0)) { - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0); - - if (buf != NULL) { - cpi->svc.empty_frame.ts_start = buf->ts_start; - cpi->svc.empty_frame.ts_end = buf->ts_end; - cpi->svc.encode_empty_frame_state = ENCODING; - cpi->common.show_frame = 0; - cpi->ref_frame_flags = 0; - cpi->common.frame_type = INTER_FRAME; - cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = - SMALL_FRAME_FB_IDX; - - if (cpi->svc.encode_intra_empty_frame != 0) cpi->common.intra_only = 1; - - width = SMALL_FRAME_WIDTH; - height = SMALL_FRAME_HEIGHT; - } - } - } - - cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q); - cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q); - - vp9_change_config(cpi, &cpi->oxcf); - - if (vp9_set_size_literal(cpi, width, height) != 0) - return VPX_CODEC_INVALID_PARAM; - - vp9_set_high_precision_mv(cpi, 1); - - cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; - - return 0; -} - -#undef SMALL_FRAME_FB_IDX -#endif // CONFIG_SPATIAL_SVC - struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, struct lookahead_ctx *ctx, int drain) { @@ -817,7 +1032,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) { } // Reset on key frame: reset counters, references and buffer updates. -void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { +void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) { int sl, tl; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; @@ -825,7 +1040,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; lc->current_video_frame_in_layer = 0; - lc->frames_from_key_frame = 0; + if (is_key) lc->frames_from_key_frame = 0; } } if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { @@ -839,3 +1054,330 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth / 3 > (lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + const int sl = svc->spatial_layer_id; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled + // on key frames or if any spatial layer is a sync layer. + if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + !svc->superframe_has_layer_sync) || + svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + svc->drop_spatial_layer[sl - 1]) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && + (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame)); + // Point golden/altref frame buffer index to last. + if (!svc->simulcast_mode) { + if (ref_frame == GOLDEN_FRAME) + cpi->gld_fb_idx = cpi->lst_fb_idx; + else if (ref_frame == ALTREF_FRAME) + cpi->alt_fb_idx = cpi->lst_fb_idx; + } + } + } + } + } + // For fixed/non-flexible SVC: check for disabling inter-layer prediction. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we disable + // inter-layer prediction. Only need to check when inter_layer prediction is + // not set to OFF mode. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. + // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. + int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int disable = 1; + if (fb_idx < 0) continue; + if ((fb_idx == svc->lst_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->gld_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->alt_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx)))) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the following constraint are expected, + // when inter-layer prediction is on (default). + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always temporal reference, GOLDEN is + // spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0 && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } else if (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For the usage of golden as second long term reference: the + // temporal_layer_id of that reference must be base temporal layer 0, and + // spatial_layer_id of that reference must be same as current + // spatial_layer_id. If not, disable feature. + // TODO(marpan): Investigate when this can happen, and maybe put this check + // and reset in a different place. + if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] != + svc->spatial_layer_id || + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0) + svc->use_gf_temporal_ref_current_layer = 0; + } +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(VP9_COMP *const cpi) { + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + return denoise_svc(cpi) && !lc->is_key_frame; +} +#endif + +void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Only for superframes whose base is not key, as those are + // already sync frames. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + if (svc->spatial_layer_id == 0) { + // On base spatial layer: if the current superframe has a layer sync then + // reset the pattern counters and reset to base temporal layer. + if (svc->superframe_has_layer_sync) + vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME); + } + // If the layer sync is set for this current spatial layer then + // disable the temporal reference. + if (svc->spatial_layer_id > 0 && + svc->spatial_layer_sync[svc->spatial_layer_id]) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + // If golden is used as second reference: need to remove it from + // prediction, reset refresh period to 0, and update the reference. + svc->use_gf_temporal_ref_current_layer = 0; + cpi->rc.baseline_gf_interval = 0; + cpi->rc.frames_till_gf_update_due = 0; + // On layer sync frame we must update the buffer index used for long + // term reference. Use the alt_ref since it is not used or updated on + // sync frames. + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + } +} + +void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int i = 0; + // Update the usage of frame buffer index for base spatial layers. + if (svc->spatial_layer_id == 0) { + if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) + svc->fb_idx_base[cpi->lst_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame) + svc->fb_idx_base[cpi->gld_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) + svc->fb_idx_base[cpi->alt_fb_idx] = 1; + // For bypass/flexible mode: check for refresh slots. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + for (i = 0; i < REF_FRAMES; ++i) + if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1; + } + } +} + +static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) { + // For non-flexible/bypass SVC mode: check for refreshing other buffer + // slots. + SVC *const svc = &cpi->svc; + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < REF_FRAMES; i++) { + if ((cm->frame_type == KEY_FRAME && !svc->simulcast_mode) || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + } + } +} + +void vp9_svc_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BufferPool *const pool = cm->buffer_pool; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + vp9_svc_update_ref_frame_bypass_mode(cpi); + } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) { + // Keep track of frame index for each reference frame. + int i; + // On key frame update all reference frame slots. + for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + // LAST/GOLDEN/ALTREF is already updated above. + if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } + } else { + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } + } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); + vp9_svc_update_ref_frame_buffer_idx(cpi); +} + +void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) { + int64_t this_duration = + cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id]; + vp9_new_framerate(cpi, 10000000.0 / this_duration); +} + +void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + RATE_CONTROL *const rc = &cpi->rc; + // On key frames in CBR mode: reset the avg_frame_qindex for base layer + // (to level closer to worst_quality) if the overshoot is significant. + // Reset it for all temporal layers on base spatial layer. + if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + !svc->simulcast_mode && + rc->projected_frame_size / 3 > rc->avg_frame_bandwidth) { + int tl; + rc->avg_frame_qindex[INTER_FRAME] = + VPXMAX(rc->avg_frame_qindex[INTER_FRAME], + (cm->base_qindex + rc->worst_quality) >> 1); + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME]; + } + } +} + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. +int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h index ee7a6638b4..388a02789d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ -#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ #include "vpx/vpx_encoder.h" @@ -19,6 +19,24 @@ extern "C" { #endif +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +typedef struct BUFFER_LONGTERM_REF { + int idx; + int is_used; +} BUFFER_LONGTERM_REF; + typedef struct { RATE_CONTROL rc; int target_bandwidth; @@ -29,6 +47,9 @@ typedef struct { int min_q; int scaling_factor_num; int scaling_factor_den; + // Scaling factors used for internal resize scaling for single layer SVC. + int scaling_factor_num_resize; + int scaling_factor_den_resize; TWO_PASS twopass; vpx_fixed_buf_t rc_twopass_stats_in; unsigned int current_video_frame_in_layer; @@ -40,24 +61,29 @@ typedef struct { int gold_ref_idx; int has_alt_frame; size_t layer_size; - struct vpx_psnr_pkt psnr_pkt; // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct. int sb_index; signed char *map; uint8_t *last_coded_q_map; uint8_t *consec_zero_mv; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; + int qindex_delta[3]; uint8_t speed; + int loopfilter_ctrl; + int frame_qp; + int MBs; } LAYER_CONTEXT; -typedef struct { +typedef struct SVC { int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; int number_temporal_layers; int spatial_layer_to_encode; - int first_spatial_layer_to_encode; - int rc_drop_superframe; // Workaround for multiple frame contexts enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state; @@ -81,13 +107,104 @@ typedef struct { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int force_zero_mode_spatial_ref; + // Sequence level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref; + // Frame level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref_current_layer; + // Allow second reference for at most 2 top highest resolution layers. + BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2]; int current_superframe; + int non_reference_frame; int use_base_mv; + int use_partition_reuse; + // Used to control the downscaling filter for source scaling, for 1 pass CBR. + // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), + // = 8 will center the target pixel and get a symmetric averaging filter. + // downsample_filter_type: 4 filters may be used: eighttap_regular, + // eighttap_smooth, eighttap_sharp, and bilinear. + INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; + int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + int mi_rows[VPX_MAX_LAYERS]; + int mi_cols[VPX_MAX_LAYERS]; + + int first_layer_denoise; + + int skip_enhancement_layer; + + int lower_layer_qindex; + + int last_layer_dropped[VPX_MAX_LAYERS]; + int drop_spatial_layer[VPX_MAX_LAYERS]; + int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int force_drop_constrained_from_above[VPX_MAX_LAYERS]; + int max_consec_drop; + SVC_LAYER_DROP_MODE framedrop_mode; + + INTER_LAYER_PRED disable_inter_layer_pred; + + // Flag to indicate scene change and high num of motion blocks at current + // superframe, scene detection is currently checked for each superframe prior + // to encoding, on the full resolution source. + int high_source_sad_superframe; + int high_num_blocks_with_motion; + + // Flags used to get SVC pattern info. + int update_buffer_slot[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + // TODO(jianj): Remove these last 3, deprecated. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. + uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; + + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + // Quantizer for each spatial layer. + int base_qindex[VPX_SS_MAX_LAYERS]; + uint8_t set_intra_only_frame; + uint8_t previous_frame_is_intra_only; + uint8_t superframe_has_layer_sync; + + uint8_t fb_idx_base[REF_FRAMES]; + + int use_set_ref_frame_config; + + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; + + int first_spatial_layer_to_encode; + + // Parameters for allowing framerate per spatial layer, and buffer + // update based on timestamps. + int64_t duration[VPX_SS_MAX_LAYERS]; + int64_t timebase_fac; + int64_t time_stamp_superframe; + int64_t time_stamp_prev[VPX_SS_MAX_LAYERS]; + + int num_encoded_top_layer; + + // Every spatial layer on a superframe whose base is key is key too. + int simulcast_mode; + + // Flag to indicate SVC is dynamically switched to a single layer. + int single_layer_svc; + int resize_set; } SVC; struct VP9_COMP; @@ -117,6 +234,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi); // Initialize second pass rc for spatial svc. void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + // Increment number of video frames in layer void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); @@ -131,14 +252,39 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); -int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); +#endif + +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + +int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); -void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key); +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + +void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_ +#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c index 344658483a..46f36c3eb9 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -8,13 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -30,54 +34,372 @@ #include "vpx_scale/vpx_scale.h" static int fixed_divide[512]; +static unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t highbd_index_mult[14] = { 0U, 0U, 0U, + 0U, 3221225472U, 2576980378U, + 2147483648U, 1840700270U, 1610612736U, + 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +static const MV kZeroMv = { 0, 0 }; +#define TF_INTERP_EXTEND 6 + +// Prediction function using 12-tap interpolation filter. +void vpx_convolve12_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + int x, y; + src -= MAX_FILTER_TAP / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = filter[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < MAX_FILTER_TAP; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve12_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + int x, y; + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = filter[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < MAX_FILTER_TAP; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +// Copied from vpx_convolve8_c(). Possible block sizes are 32x32, 16x16, 8x8. +void vpx_convolve12_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel12 *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[BW * (BH + MAX_FILTER_TAP - 1)]; + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + + vpx_convolve12_horiz_c(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, intermediate_height); + vpx_convolve12_vert_c(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), + temp_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} + +static void vp9_build_inter_predictor_12( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, + const InterpKernel12 *kernel, enum mv_precision precision, int x, int y) { + (void)ref; + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + if (subpel_x == 0 && subpel_y == 0) { + vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h); + } else if (subpel_x == 0 && subpel_y != 0) { + vpx_convolve12_vert(src, src_stride, dst, dst_stride, kernel, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h); + } else if (subpel_x != 0 && subpel_y == 0) { + vpx_convolve12_horiz(src, src_stride, dst, dst_stride, kernel, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h); + } else { + vpx_convolve12(src, src_stride, dst, dst_stride, kernel, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_convolve12_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; + (void)y_step_q4; + int x, y; + src -= MAX_FILTER_TAP / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = filter[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < MAX_FILTER_TAP; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_highbd_convolve12_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)x0_q4; + (void)x_step_q4; + int x, y; + src -= src_stride * (MAX_FILTER_TAP / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = filter[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < MAX_FILTER_TAP; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve12(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + uint16_t temp[BW * (BH + MAX_FILTER_TAP - 1)]; + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + + vpx_highbd_convolve12_horiz_c(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve12_vert_c(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), + temp_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} + +// Copied from vpx_highbd_convolve8_c() +void vpx_highbd_convolve12_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + highbd_convolve12(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); +} + +static void vp9_highbd_build_inter_predictor_12( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, + const InterpKernel12 *kernel, enum mv_precision precision, int x, int y, + int bd) { + (void)ref; + const int is_q4 = precision == MV_PRECISION_Q4; + const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, + is_q4 ? src_mv->col : src_mv->col * 2 }; + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; + + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + if (subpel_x == 0 && subpel_y == 0) { + vpx_highbd_convolve_copy(src, src_stride, dst, dst_stride, NULL, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h, bd); + } else if (subpel_x == 0 && subpel_y != 0) { + vpx_highbd_convolve12_vert(src, src_stride, dst, dst_stride, kernel, + subpel_x, sf->x_step_q4, subpel_y, sf->y_step_q4, + w, h, bd); + } else if (subpel_x != 0 && subpel_y == 0) { + vpx_highbd_convolve12_horiz(src, src_stride, dst, dst_stride, kernel, + subpel_x, sf->x_step_q4, subpel_y, + sf->y_step_q4, w, h, bd); + } else { + vpx_highbd_convolve12(src, src_stride, dst, dst_stride, kernel, subpel_x, + sf->x_step_q4, subpel_y, sf->y_step_q4, w, h, bd); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs, + int use_32x32) { const int which_mv = 0; - const MV mv = { mv_row, mv_col }; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP]; + const InterpKernel12 *const kernel = sub_pel_filters_12; + int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_width == 8) { + if (uv_block_width == (BW >> 1)) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } +#if !CONFIG_VP9_HIGHBITDEPTH + (void)xd; +#endif + if (use_32x32) { + const MV mv = { mv_row, mv_col }; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, - 16, 16, which_mv, kernel, MV_PRECISION_Q3, - x, y, xd->bd); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor_12(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), BW, + &mv, scale, BW, BH, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); - vp9_highbd_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + vp9_highbd_build_inter_predictor_12( + CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, + y, xd->bd); - vp9_highbd_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + vp9_highbd_build_inter_predictor_12( + CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv, + scale, uv_block_width, uv_block_height, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_build_inter_predictor_12(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, which_mv, kernel, MV_PRECISION_Q3, x, y); + + vp9_build_inter_predictor_12(u_mb_ptr, uv_stride, &pred[BLK_PELS], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor_12(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); return; } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. + // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor_12( + CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride, + CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd); + } else { + vp9_build_inter_predictor_12(y_mb_ptr + y_offset, stride, + &pred[p_offset], BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y); + } +#else + vp9_build_inter_predictor_12(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); #endif // CONFIG_VP9_HIGHBITDEPTH - (void)xd; - vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - which_mv, kernel, MV_PRECISION_Q3, x, y); + k++; + } + } - vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + // U and V predictors + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; - vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor_12( + CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y, + xd->bd); + + vp9_highbd_build_inter_predictor_12( + CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]), + uv_block_width, &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + } else { + vp9_build_inter_predictor_12(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor_12(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, + which_mv, kernel, mv_precision_uv, x, y); + } +#else + vp9_build_inter_predictor_12(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor_12(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } } void vp9_temporal_filter_init(void) { @@ -87,149 +409,387 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, - uint8_t *frame2, unsigned int block_width, - unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, - uint16_t *count) { - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + mod = + ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int highbd_mod_index(int sum_dist, int index, int rounding, + int strength, int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(highbd_index_mult[index] != 0); + + mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, + const int *const blk_fw, int use_32x32) { + // blk_fw[0] ~ blk_fw[3] are the same. + if (use_32x32) { + return blk_fw[0]; + } + + if (i < block_height / 2) { + if (j < block_width / 2) { + return blk_fw[0]; + } + + return blk_fw[1]; + } + + if (j < block_width / 2) { + return blk_fw[2]; + } + + return blk_fw[3]; +} + +void vp9_apply_temporal_filter_c( + const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, + int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, + int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, + int uv_buf_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { + unsigned int i, j, k, m; + int modifier; + const int rounding = (1 << strength) >> 1; + const unsigned int uv_block_width = block_width >> ss_x; + const unsigned int uv_block_height = block_height >> ss_y; + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); + + int idx = 0, idy; + + assert(strength >= 0); + assert(strength <= 6); + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + + // Calculate diff^2 for each pixel of the 16x16 block. + // TODO(yunqing): the following code needs to be optimized. + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int16_t diff = + y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j]; + y_diff_sse[idx++] = diff * diff; + } + } + idx = 0; + for (i = 0; i < uv_block_height; i++) { + for (j = 0; j < uv_block_width; j++) { + const int16_t diffu = + u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j]; + const int16_t diffv = + v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j]; + u_diff_sse[idx] = diffu * diffu; + v_diff_sse[idx] = diffv * diffv; + idx++; + } + } + + for (i = 0, k = 0, m = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = y_pred[i * y_buf_stride + j]; + const int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + int y_index = 0; + + const int uv_r = i >> ss_y; + const int uv_c = j >> ss_x; + modifier = 0; for (idy = -1; idy <= 1; ++idy) { for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + const int row = (int)i + idy; + const int col = (int)j + idx; if (row >= 0 && row < (int)block_height && col >= 0 && col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + modifier += y_diff_sse[row * (int)block_width + col]; + ++y_index; } } } - assert(index > 0); + assert(y_index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + modifier += u_diff_sse[uv_r * uv_block_width + uv_c]; + modifier += v_diff_sse[uv_r * uv_block_width + uv_c]; - modifier *= 3; - modifier /= index; + y_index += 2; - ++frame2; + modifier = + mod_index(modifier, y_index, rounding, strength, filter_weight); - modifier += rounding; - modifier >>= strength; + y_count[k] += modifier; + y_accumulator[k] += modifier * pixel_value; - if (modifier > 16) modifier = 16; + ++k; - modifier = 16 - modifier; - modifier *= filter_weight; + // Process chroma component + if (!(i & ss_y) && !(j & ss_x)) { + const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c]; + const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c]; - count[k] += modifier; - accumulator[k] += modifier * pixel_value; + // non-local mean approach + int cr_index = 0; + int u_mod = 0, v_mod = 0; + int y_diff = 0; - byte++; + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = uv_r + idy; + const int col = uv_c + idx; + + if (row >= 0 && row < (int)uv_block_height && col >= 0 && + col < (int)uv_block_width) { + u_mod += u_diff_sse[row * uv_block_width + col]; + v_mod += v_diff_sse[row * uv_block_width + col]; + ++cr_index; + } + } + } + + assert(cr_index > 0); + + for (idy = 0; idy < 1 + ss_y; ++idy) { + for (idx = 0; idx < 1 + ss_x; ++idx) { + const int row = (uv_r << ss_y) + idy; + const int col = (uv_c << ss_x) + idx; + y_diff += y_diff_sse[row * (int)block_width + col]; + ++cr_index; + } + } + + u_mod += y_diff; + v_mod += y_diff; + + u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); + v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); + + u_count[m] += u_mod; + u_accumulator[m] += u_mod * u_pixel_value; + v_count[m] += v_mod; + v_accumulator[m] += v_mod * v_pixel_value; + + ++m; + } // Complete YUV pixel } - - byte += stride - block_width; } } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_temporal_filter_apply_c( - uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, - unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { - uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; +void vp9_highbd_apply_temporal_filter_c( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x; + const int uv_block_height = block_height >> ss_y; + const int y_diff_stride = BW; + const int uv_diff_stride = BW; - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + const int rounding = (1 << strength) >> 1; - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + // Loop variables + int row, col; + int uv_row, uv_col; + int row_step, col_step; + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + + // Get the square diffs + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_diff_sse[row * y_diff_stride + col] = diff * diff; + } + } + + for (row = 0; row < uv_block_height; row++) { + for (col = 0; col < uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int filter_weight = get_filter_weight( + row, col, block_height, block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col]; + y_num_used++; } } } - assert(index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; + y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; + y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; - modifier *= 3; - modifier /= index; + y_num_used += 2; - ++frame2; - modifier += rounding; - modifier >>= strength; + // Set the modifier + y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength, + filter_weight); - if (modifier > 16) modifier = 16; - - modifier = 16 - modifier; - modifier *= filter_weight; - - count[k] += modifier; - accumulator[k] += modifier * pixel_value; - - byte++; + // Accumulate the result + y_count[row * block_width + col] += y_mod; + y_accum[row * block_width + col] += y_mod * y_pixel; } + } - byte += stride - block_width; + // Apply the filter to chroma + for (uv_row = 0; uv_row < uv_block_height; uv_row++) { + for (uv_col = 0; uv_col < uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = get_filter_weight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (row_step = -1; row_step <= 1; row_step++) { + for (col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col]; + v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (row_step = 0; row_step < 1 + ss_y; row_step++) { + for (col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + u_count[uv_row * uv_block_width + uv_col] += u_mod; + u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel; + v_count[uv_row * uv_block_width + uv_col] += v_mod; + v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel; + } } } #endif // CONFIG_VP9_HIGHBITDEPTH -static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride) { - MACROBLOCK *const x = &cpi->td.mb; +static uint32_t temporal_filter_find_matching_mb_c( + VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs, + int *blk_bestsme, int *is_dc_diff_large) { + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS old_search_method = mv_sf->search_method; + const SEARCH_METHODS search_method = MESH; + const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method; int step_param; int sadpb = x->sadperbit16; uint32_t bestsme = UINT_MAX; uint32_t distortion; uint32_t sse; int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; MV best_ref_mv1 = { 0, 0 }; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; + int i, j, k = 0; best_ref_mv1_full.col = best_ref_mv1.col >> 3; best_ref_mv1_full.row = best_ref_mv1.row >> 3; @@ -239,22 +799,60 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, x->plane[0].src.stride = stride; xd->plane[0].pre[0].buf = frame_ptr_buf; xd->plane[0].pre[0].stride = stride; + *is_dc_diff_large = 0; step_param = mv_sf->reduce_first_step_size; step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - mv_sf->search_method = HEX; - vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, - ref_mv, 0, 0); - mv_sf->search_method = old_search_method; + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - // Ignore mv costing by sending NULL pointer instead of cost array + vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, ref_mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // ref_mv. bestsme = cpi->find_fractional_mv_step( x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, - mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, 0, 0); + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW, + BH, USE_8_TAPS_SHARP); + *is_dc_diff_large = 50 * bestsme < sse; + + // DO motion search on 4 16x16 sub_blocks. + best_ref_mv1.row = ref_mv->row; + best_ref_mv1.col = ref_mv->col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, search_method_16, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, + &blk_mvs[k], 0, 0); + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0, + mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP); + k++; + } + } // Restore input state x->plane[0].src = src; @@ -263,37 +861,40 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -static void temporal_filter_iterate_c(VP9_COMP *cpi, - YV12_BUFFER_CONFIG **frames, - int frame_count, int alt_ref_index, - int strength, - struct scale_factors *scale) { +void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end) { + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int frame_count = arnr_filter_data->frame_count; + int alt_ref_index = arnr_filter_data->alt_ref_index; + int strength = arnr_filter_data->strength; + struct scale_factors *scale = &arnr_filter_data->sf; int byte; int frame; - int mb_col, mb_row; - unsigned int filter_weight; - int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; - int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - int mb_y_offset = 0; - int mb_uv_offset = 0; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + int mb_col; + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; + DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); + MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; + YV12_BUFFER_CONFIG *dst = arnr_filter_data->dst; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]); uint8_t *predictor; #else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]); #endif - const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; - const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; + // Addition of the tile col level offsets + int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start; + int mb_uv_offset = + mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); @@ -302,221 +903,234 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } #endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) + // A 6/8/12 tap filter is used for motion search and prediction. So the + // largest Y mv on a border would then be 16 - TF_INTERP_EXTEND. The UV + // blocks are half the size of the Y and therefore only extended by 8. + // The largest mv that a UV block can support is 8 - TF_INTERP_EXTEND. + // A UV mv is half of a Y mv. (16 - TF_INTERP_EXTEND) >> 1 is greater than + // 8 - TF_INTERP_EXTEND. To keep the mv in play for both Y and UV planes, + // the max that it can be on a border is therefore 16 - (2 * TF_INTERP_EXTEND + // + 1). + td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * TF_INTERP_EXTEND)); + td->mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * TF_INTERP_EXTEND); - for (mb_row = 0; mb_row < mb_rows; mb_row++) { - // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) - // A 6/8 tap filter is used for motion search. This requires 2 pixels - // before and 3 pixels after. So the largest Y mv on a border would - // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the - // Y and therefore only extended by 8. The largest mv that a UV block - // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. - // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than - // 8 - VP9_INTERP_EXTEND. - // To keep the mv in play for both Y and UV planes the max that it - // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). - cpi->td.mb.mv_limits.row_min = - -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { + int i, j, k; + int stride; + MV ref_mv; - for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int i, j, k; - int stride; + vp9_zero_array(accumulator, BLK_PELS * 3); + vp9_zero_array(count, BLK_PELS * 3); - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * TF_INTERP_EXTEND)); + td->mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * TF_INTERP_EXTEND); - cpi->td.mb.mv_limits.col_min = - -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; - for (frame = 0; frame < frame_count; frame++) { - const uint32_t thresh_low = 10000; - const uint32_t thresh_high = 20000; - - if (frames[frame] == NULL) continue; - - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; - - if (frame == alt_ref_index) { - filter_weight = 2; - } else { - // Find best match in this frame by MC - uint32_t err = temporal_filter_find_matching_mb_c( - cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); - - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; - } - - if (filter_weight != 0) { - // Construct the predictors - temporal_filter_predictors_mb_c( - mbd, frames[frame]->y_buffer + mb_y_offset, - frames[frame]->u_buffer + mb_uv_offset, - frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); - -#if CONFIG_VP9_HIGHBITDEPTH - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int adj_strength = strength + 2 * (mbd->bd - 8); - // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); - } else { - // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else - // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, - mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); -#endif // CONFIG_VP9_HIGHBITDEPTH - } - } + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst1_16; - uint16_t *dst2_16; - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1_16[byte] = (uint16_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } - - byte += stride - mb_uv_width; - } + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd); } else { - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - 16; - } - - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1[byte] = (uint8_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - mb_uv_width; - } + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); } #else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) { + strength = VPXMAX(0, arnr_filter_data->strength - 2); + } + } + + for (frame = 0; frame < frame_count; frame++) { + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. + int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; + + if (frames[frame] == NULL) continue; + + ref_mv.row = 0; + ref_mv.col = 0; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; + + if (frame == alt_ref_index) { + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; + } else { + const int thresh_low = 10000; + const int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int is_dc_diff_large = 0; + + // Find best match in this frame by MC + int err = temporal_filter_find_matching_mb_c( + cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + &ref_mv, blk_mvs, blk_bestsme, &is_dc_diff_large); + + if (cpi->oxcf.enable_keyframe_filtering == 1 && + cpi->common.frame_type == KEY_FRAME && is_dc_diff_large) + strength = VPXMIN(strength, 1); + + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } + + if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 + : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low ? 2 + : blk_bestsme[k] < thresh_high ? 1 + : 0; + } + + for (k = 0; k < 4; k++) { + switch (abs(frame - alt_ref_index)) { + case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break; + case 2: + case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break; + default: break; + } + } + } + + if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, + mb_col * BW, mb_row * BH, blk_mvs, use_32x32); + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + vp9_highbd_apply_temporal_filter( + CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride, + CONVERT_TO_SHORTPTR(predictor), BW, + CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset), + CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride, + CONVERT_TO_SHORTPTR(predictor + BLK_PELS), + CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW, + BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y, + adj_strength, blk_fw, use_32x32, accumulator, count, + accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); + } else { + // Apply the filter (YUV) + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); + } +#else + // Apply the filter (YUV) + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; + dst1 = dst->y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = dst->y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - BW; + } + + dst1 = dst->u_buffer; + dst2 = dst->v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = dst->uv_stride; + byte = mb_uv_offset; + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + BLK_PELS; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1_16[byte] = (uint16_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; + } + + byte += stride - mb_uv_width; + } + } else { + // Normalize filter output to produce AltRef frame + dst1 = dst->y_buffer; + stride = dst->y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -526,16 +1140,16 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; + dst1 = dst->u_buffer; + dst2 = dst->v_buffer; + stride = dst->uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -554,50 +1168,110 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } byte += stride - mb_uv_width; } -#endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; - mb_uv_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; - } +#else + // Normalize filter output to produce AltRef frame + dst1 = dst->y_buffer; + stride = dst->y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; - // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - BW; + } + + dst1 = dst->u_buffer; + dst2 = dst->v_buffer; + stride = dst->uv_stride; + byte = mb_uv_offset; + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + BLK_PELS; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + mb_y_offset += BW; + mb_uv_offset += mb_uv_width; + } +} + +static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT; + const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT; + const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT; + const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT; + int mb_row; + + for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { + vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start, + mb_col_end); + } +} + +static void temporal_filter_iterate_c(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_row, tile_col; + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); + } + } } // Apply buffer limits and context specific adjustments to arnr filter. static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, - int *arnr_frames, int *arnr_strength) { + int *arnr_frames, int *frames_backward, + int *frames_forward, int *arnr_strength) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int frames_after_arf = - vp9_lookahead_depth(cpi->lookahead) - distance - 1; - int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; - int frames_bwd; - int q, frames, base_strength, strength; + + int max_fwd = + VPXMAX((int)vp9_lookahead_depth(cpi->lookahead) - distance - 1, 0); + int max_bwd = VPXMAX(distance, 0); + int frames = VPXMAX(oxcf->arnr_max_frames, 1); + int q, base_strength, strength; // Context dependent two pass adjustment to strength. if (oxcf->pass == 2) { base_strength = oxcf->arnr_strength + cpi->twopass.arnr_strength_adjustment; // Clip to allowed range. - base_strength = VPXMIN(6, VPXMAX(0, base_strength)); + base_strength = clamp(base_strength, 0, 6); } else { base_strength = oxcf->arnr_strength; } - // Define the forward and backwards filter limits for this arnr group. - if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf; - if (frames_fwd > distance) frames_fwd = distance; - - frames_bwd = frames_fwd; - - // For even length filter there is one more frame backward - // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. - if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1; - - // Set the baseline active filter size. - frames = frames_bwd + 1 + frames_fwd; - // Adjust the strength based on active max q. if (cpi->common.current_video_frame > 1) q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME], @@ -613,23 +1287,44 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, } // Adjust number of frames in filter and strength based on gf boost level. - if (frames > group_boost / 150) { - frames = group_boost / 150; - frames += !(frames & 1); - } + frames = VPXMIN(frames, group_boost / 150); if (strength > group_boost / 300) { strength = group_boost / 300; } - // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; + if (VPXMIN(max_fwd, max_bwd) >= frames / 2) { + // Handle the even/odd case. + *frames_backward = frames / 2; + *frames_forward = (frames - 1) / 2; + } else { + if (max_fwd < frames / 2) { + *frames_forward = max_fwd; + *frames_backward = VPXMIN(frames - 1 - *frames_forward, max_bwd); + } else { + *frames_backward = max_bwd; + *frames_forward = VPXMIN(frames - 1 - *frames_backward, max_fwd); } } + // Set the baseline active filter size. + frames = *frames_backward + 1 + *frames_forward; + + // Adjustments for second level arf in multi arf case. + // Leave commented out place holder for possible filtering adjustment with + // new multi-layer arf code. + // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) + // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1; + + // TODO(jingning): Skip temporal filtering for intermediate frames that will + // be used as show_existing_frame. Need to further explore the possibility to + // apply certain filter. + if (frames <= 1) { + frames = 1; + *frames_backward = 0; + *frames_forward = 0; + } + *arnr_frames = frames; *arnr_strength = strength; } @@ -638,21 +1333,28 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; int frame; int frames_to_blur; int start_frame; int strength; int frames_to_blur_backward; int frames_to_blur_forward; - struct scale_factors sf; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; + struct scale_factors *sf = &arnr_filter_data->sf; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int rdmult; // Apply context specific adjustments to the arnr filter parameters. - adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); - frames_to_blur_backward = (frames_to_blur / 2); - frames_to_blur_forward = ((frames_to_blur - 1) / 2); + adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, + &frames_to_blur_backward, &frames_to_blur_forward, + &strength); start_frame = distance + frames_to_blur_forward; + arnr_filter_data->strength = strength; + arnr_filter_data->frame_count = frames_to_blur; + arnr_filter_data->alt_ref_index = frames_to_blur_backward; + arnr_filter_data->dst = &cpi->tf_buffer; + // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; @@ -661,6 +1363,11 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { frames[frames_to_blur - 1 - frame] = &buf->img; } + YV12_BUFFER_CONFIG *f = frames[arnr_filter_data->alt_ref_index]; + xd->cur_buf = f; + xd->plane[1].subsampling_y = f->subsampling_y; + xd->plane[1].subsampling_x = f->subsampling_x; + if (frames_to_blur > 0) { // Setup scaling factors. Scaling on each of the arnr frames is not // supported. @@ -670,13 +1377,13 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { int frame_used = 0; #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height); @@ -697,7 +1404,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { "Failed to reallocate alt_ref_buffer"); } frames[frame] = vp9_scale_if_required( - cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0); + cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0, + EIGHTTAP, 0); ++frame_used; } } @@ -708,17 +1416,24 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH } } - temporal_filter_iterate_c(cpi, frames, frames_to_blur, - frames_to_blur_backward, strength, &sf); + // Initialize errorperbit and sabperbit. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); + + if (!cpi->row_mt) + temporal_filter_iterate_c(cpi); + else + vp9_temporal_filter_row_mt(cpi); } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h index f537b8870a..59fb71fc0d 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -8,18 +8,64 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { #endif +#define ARNR_FILT_QINDEX 128 +struct VP9_COMP; +struct ThreadData; + +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS ((BH) * (BW)) // Pixels in the block +#define TF_SHIFT 2 +#define TF_ROUND 3 +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 +#define MAX_FILTER_TAP 12 + +typedef int16_t InterpKernel12[MAX_FILTER_TAP]; + +// 12-tap filter (used by the encoder only). +DECLARE_ALIGNED(256, static const InterpKernel12, + sub_pel_filters_12[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, + { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0 }, + { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1 }, + { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1 }, + { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1 }, + { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2 }, + { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2 }, + { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2 }, + { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2 }, + { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2 }, + { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2 }, + { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2 }, + { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1 }, + { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1 }, + { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 }, + { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 } +}; + void vp9_temporal_filter_init(void); -void vp9_temporal_filter(VP9_COMP *cpi, int distance); +void vp9_temporal_filter(struct VP9_COMP *cpi, int distance); + +void vp9_temporal_filter_iterate_row_c(struct VP9_COMP *cpi, + struct ThreadData *td, int mb_row, + int mb_col_start, int mb_col_end); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h new file mode 100644 index 0000000000..8776dfc068 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#include "./vpx_config.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32758 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define DIST_STRIDE ((BW) + 2) + +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c index dc2616dbe1..6c6c04493f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c @@ -123,7 +123,7 @@ const int16_t vp9_cat6_low_cost[256] = { 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982 }; -const int vp9_cat6_high_cost[64] = { +const uint16_t vp9_cat6_high_cost[64] = { 88, 2251, 2727, 4890, 3148, 5311, 5787, 7950, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 3666, 5829, 6305, 8468, 6726, 8889, 9365, 11528, 7244, 9407, 9883, 12046, 10304, 12467, 12943, 15106, 3666, @@ -133,7 +133,7 @@ const int vp9_cat6_high_cost[64] = { }; #if CONFIG_VP9_HIGHBITDEPTH -const int vp9_cat6_high10_high_cost[256] = { +const uint16_t vp9_cat6_high10_high_cost[256] = { 94, 2257, 2733, 4896, 3154, 5317, 5793, 7956, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 3672, 5835, 6311, 8474, 6732, 8895, 9371, 11534, 7250, 9413, 9889, 12052, 10310, 12473, 12949, 15112, 3672, @@ -159,7 +159,7 @@ const int vp9_cat6_high10_high_cost[256] = { 18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074, 24237, 24713, 26876 }; -const int vp9_cat6_high12_high_cost[1024] = { +const uint16_t vp9_cat6_high12_high_cost[1024] = { 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678, @@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col, const PLANE_TYPE type = get_plane_type(plane); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int16_t *scan, *nb; - const scan_order *so; + const ScanOrder *so; const int ref = is_inter_block(mi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[tx_size][type][ref]; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h index c905715d7d..6407ff9237 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TOKENIZE_H_ -#define VP9_ENCODER_VP9_TOKENIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_ +#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_ #include "vp9/common/vp9_entropy.h" @@ -76,25 +76,18 @@ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens; extern const int *vp9_dct_cat_lt_10_value_cost; extern const int16_t vp9_cat6_low_cost[256]; -extern const int vp9_cat6_high_cost[64]; -extern const int vp9_cat6_high10_high_cost[256]; -extern const int vp9_cat6_high12_high_cost[1024]; -static INLINE int vp9_get_cost(int16_t token, EXTRABIT extrabits, - const int *cat6_high_table) { - if (token != CATEGORY6_TOKEN) - return vp9_extra_bits[token].cost[extrabits >> 1]; - return vp9_cat6_low_cost[(extrabits >> 1) & 0xff] + - cat6_high_table[extrabits >> 9]; -} +extern const uint16_t vp9_cat6_high_cost[64]; +extern const uint16_t vp9_cat6_high10_high_cost[256]; +extern const uint16_t vp9_cat6_high12_high_cost[1024]; #if CONFIG_VP9_HIGHBITDEPTH -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { return bit_depth == 8 ? vp9_cat6_high_cost : (bit_depth == 10 ? vp9_cat6_high10_high_cost : vp9_cat6_high12_high_cost); } #else -static INLINE const int *vp9_get_high_cost_table(int bit_depth) { +static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) { (void)bit_depth; return vp9_cat6_high_cost; } @@ -118,7 +111,7 @@ static INLINE int16_t vp9_get_token(int v) { } static INLINE int vp9_get_token_cost(int v, int16_t *token, - const int *cat6_high_table) { + const uint16_t *cat6_high_table) { if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) { EXTRABIT extrabits; *token = CATEGORY6_TOKEN; @@ -134,4 +127,4 @@ static INLINE int vp9_get_token_cost(int v, int16_t *token, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TOKENIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 0000000000..f65c98f779 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1791 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ext_ratectrl.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" + +static int init_gop_frames_rc(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + int added_overlay = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + int ref_table[3]; + + if (gf_group->index == 1 && gf_group->update_type[1] == ARF_UPDATE) { + if (gf_group->update_type[0] == KF_UPDATE) { + // This is the only frame in ref buffer. We need it to be on + // gf_picture[0]. + for (i = 0; i < 3; ++i) ref_table[i] = -REFS_PER_FRAME; + + gf_picture[0].frame = + &cm->buffer_pool + ->frame_bufs[cm->ref_frame_map[gf_group->update_ref_idx[0]]] + .buf; + ref_table[gf_group->update_ref_idx[0]] = 0; + + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME; + gf_picture[0].update_type = gf_group->update_type[0]; + } else { + for (i = 0; i < REFS_PER_FRAME; i++) { + if (cm->ref_frame_map[i] != -1) { + gf_picture[-i].frame = + &cm->buffer_pool->frame_bufs[cm->ref_frame_map[i]].buf; + ref_table[i] = -i; + } else { + ref_table[i] = -REFS_PER_FRAME; + } + } + for (i = 0; i < 3; ++i) { + gf_picture[0].ref_frame[i] = ref_table[i]; + } + } + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + for (i = 0; i < 3; ++i) gf_picture[1].ref_frame[i] = ref_table[i]; + gf_picture[1].update_type = gf_group->update_type[1]; + ref_table[gf_group->update_ref_idx[1]] = 1; + + ++*tpl_group_frames; + } else { + assert(gf_group->index == 0); + if (gf_group->update_type[0] == KF_UPDATE) { + // This is the only frame in ref buffer. We need it to be on + // gf_picture[0]. + gf_picture[0].frame = cpi->Source; + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME; + gf_picture[0].update_type = gf_group->update_type[0]; + + for (i = 0; i < 3; ++i) ref_table[i] = -REFS_PER_FRAME; + ref_table[gf_group->update_ref_idx[0]] = 0; + } else { + // Initialize ref table + for (i = 0; i < REFS_PER_FRAME; i++) { + if (cm->ref_frame_map[i] != -1) { + gf_picture[-i].frame = + &cm->buffer_pool->frame_bufs[cm->ref_frame_map[i]].buf; + ref_table[i] = -i; + } else { + ref_table[i] = -REFS_PER_FRAME; + } + } + for (i = 0; i < 3; ++i) { + gf_picture[0].ref_frame[i] = ref_table[i]; + } + gf_picture[0].update_type = gf_group->update_type[0]; + if (gf_group->update_type[0] != OVERLAY_UPDATE && + gf_group->update_ref_idx[0] != -1) { + ref_table[gf_group->update_ref_idx[0]] = 0; + } + } + ++*tpl_group_frames; + } + + int has_arf = + gf_group->gf_group_size > 1 && gf_group->update_type[1] == ARF_UPDATE && + gf_group->update_type[gf_group->gf_group_size] == OVERLAY_UPDATE; + + // Initialize P frames + for (frame_idx = *tpl_group_frames; frame_idx < MAX_ARF_GOP_SIZE; + ++frame_idx) { + if (frame_idx >= gf_group->gf_group_size && !has_arf) break; + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + for (i = 0; i < 3; ++i) { + gf_picture[frame_idx].ref_frame[i] = ref_table[i]; + } + + if (gf_group->update_type[frame_idx] != OVERLAY_UPDATE && + gf_group->update_ref_idx[frame_idx] != -1) { + ref_table[gf_group->update_ref_idx[frame_idx]] = frame_idx; + } + + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) { + added_overlay = 1; + + ++frame_idx; + ++frame_gop_offset; + break; + } + + if (frame_idx == gf_group->gf_group_size - 1 && + gf_group->update_type[gf_group->gf_group_size] != OVERLAY_UPDATE) { + ++frame_idx; + ++frame_gop_offset; + break; + } + } + + int lst_index = frame_idx - 1; + // Extend two frames outside the current gf group. + for (; has_arf && frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; + ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gf_picture[lst_index].ref_frame[0]; + gf_picture[frame_idx].ref_frame[1] = gf_picture[lst_index].ref_frame[1]; + gf_picture[frame_idx].ref_frame[2] = gf_picture[lst_index].ref_frame[2]; + + if (gf_picture[frame_idx].ref_frame[0] > + gf_picture[frame_idx].ref_frame[1] && + gf_picture[frame_idx].ref_frame[0] > + gf_picture[frame_idx].ref_frame[2]) { + gf_picture[frame_idx].ref_frame[0] = lst_index; + } else if (gf_picture[frame_idx].ref_frame[1] > + gf_picture[frame_idx].ref_frame[0] && + gf_picture[frame_idx].ref_frame[1] > + gf_picture[frame_idx].ref_frame[2]) { + gf_picture[frame_idx].ref_frame[1] = lst_index; + } else { + gf_picture[frame_idx].ref_frame[2] = lst_index; + } + + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } + + return extend_frame_count + added_overlay; +} + +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { + return init_gop_frames_rc(cpi, gf_picture, gf_group, tpl_group_frames); + } + + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -2; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -REFS_PER_FRAME; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + gf_picture[-1].frame = get_ref_frame_buffer(cpi, LAST_FRAME); + gf_picture[-2].frame = get_ref_frame_buffer(cpi, ALTREF_FRAME); + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } + + return extend_frame_count; +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].mi_rows; + const int mi_cols = tpl_stats[frame_idx].mi_cols; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. + const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int src_stride, int64_t recon_error, + int64_t pred_error, int64_t rate_cost, + int ref_frame_idx, int mi_rows, + int mi_cols) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * src_stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + if (mi_row + idy >= mi_rows || mi_col + idx >= mi_cols) continue; + VpxTplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * mi_cols + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8 + idy * 8; + tpl_block_stats_ptr->col = mi_col * 8 + idx * 8; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + // inter/intra_cost here is calculated with SATD which should be close + // enough to be used as inter/intra_pred_error + tpl_block_stats_ptr->inter_pred_err = src_stats->inter_cost; + tpl_block_stats_ptr->intra_pred_err = src_stats->intra_cost; + tpl_block_stats_ptr->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->pred_error = pred_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->mv_r = (src_stats->mv.as_mv.row >= 0 ? 1 : -1) * + (abs(src_stats->mv.as_mv.row) + 4) / 8; + tpl_block_stats_ptr->mv_c = (src_stats->mv.as_mv.col >= 0 ? 1 : -1) * + (abs(src_stats->mv.as_mv.col) + 4) / 8; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + if (tpl_stats->ref_frame_index < 0) return; + + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse, uint16_t *eob) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. + switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse, int *ref_frame_idx) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? &mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + uint16_t eob = 0; + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + if (best_rf_idx >= 0) { + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + } + tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. + for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -REFS_PER_FRAME) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } + } +#endif // CONFIG_NON_GREEDY_MV + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse, &ref_frame_idx); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_store_before_propagation( + tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, sse, rate_cost, ref_frame_idx, tpl_frame->mi_rows, + tpl_frame->mi_cols); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + &cm->error, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +void vp9_free_tpl_buffer(VP9_COMP *cpi) { + int frame; +#if CONFIG_NON_GREEDY_MV + vp9_free_motion_field_info(&cpi->motion_field_info); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } + free_tpl_frame_stats_list(&cpi->tpl_gop_stats); +} + +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + const int sb_size = num_8x8_blocks_wide_lookup[BLOCK_64X64] * MI_SIZE; + const int frame_height_sb = (cm->height + sb_size - 1) / sb_size; + const int frame_width_sb = (cm->width + sb_size - 1) / sb_size; + + vpx_codec_err_t codec_status; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + vpx_rc_encodeframe_decision_t encode_frame_decision; + + CHECK_MEM_ERROR( + &cm->error, encode_frame_decision.sb_params_list, + (sb_params *)vpx_malloc(frame_height_sb * frame_width_sb * + sizeof(*encode_frame_decision.sb_params_list))); + + for (idx = gf_index; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + if (cpi->ext_ratectrl.ready && + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { + if (idx == gop_length) break; + memset(encode_frame_decision.sb_params_list, 0, + sizeof(*encode_frame_decision.sb_params_list) * frame_height_sb * + frame_width_sb); + codec_status = vp9_extrc_get_encodeframe_decision( + &cpi->ext_ratectrl, gf_group->index, &encode_frame_decision); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cm->error, codec_status, + "vp9_extrc_get_encodeframe_decision() failed"); + } + for (int i = 0; i < frame_height_sb * frame_width_sb; ++i) { + cpi->sb_mul_scale[i] = + (((int64_t)encode_frame_decision.sb_params_list[i].rdmult * 256) / + (encode_frame_decision.rdmult + 1)); + } + tpl_frame->base_qindex = encode_frame_decision.q_index; + } else { + tpl_frame->base_qindex = vp9_rc_pick_q_and_bounds_two_pass( + cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); + } + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); + + vpx_free(encode_frame_decision.sb_params_list); +} + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture_buf[MAX_ARF_GOP_SIZE + REFS_PER_FRAME]; + GF_PICTURE *gf_picture = &gf_picture_buf[REFS_PER_FRAME]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + int extended_frame_count; + cpi->tpl_bsize = BLOCK_32X32; + + memset(gf_picture_buf, 0, sizeof(gf_picture_buf)); + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames, + cpi->common.width, cpi->common.height); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + // Intra search on key frame + if (gf_group->update_type[0] != OVERLAY_UPDATE) { + mc_flow_dispenser(cpi, gf_picture, 0, cpi->tpl_bsize); + } + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, + extended_frame_count); + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 0000000000..de0ac39a1f --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +#define TPL_DEP_COST_SCALE_LOG2 4 + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); +void vp9_estimate_tpl_qp_gop(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h index a8b9c2cd31..86c5fa2244 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TREEWRITER_H_ -#define VP9_ENCODER_VP9_TREEWRITER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_ +#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_ #include "vpx_dsp/bitwriter.h" @@ -48,4 +48,4 @@ static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TREEWRITER_H_ +#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c new file mode 100644 index 0000000000..418edd62b5 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_avx2.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +static INLINE void highbd_shuffle_12tap_filter_avx2(const int16_t *filter, + __m256i *f) { + const __m256i f_low = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)filter)); + const __m256i f_high = _mm256_broadcastsi128_si256( + _mm_loadl_epi64((const __m128i *)(filter + 8))); + + f[0] = _mm256_shuffle_epi32(f_low, 0x00); + f[1] = _mm256_shuffle_epi32(f_low, 0x55); + f[2] = _mm256_shuffle_epi32(f_low, 0xaa); + f[3] = _mm256_shuffle_epi32(f_low, 0xff); + f[4] = _mm256_shuffle_epi32(f_high, 0x00); + f[5] = _mm256_shuffle_epi32(f_high, 0x55); +} + +static INLINE __m256i highbd_convolve_12tap(const __m256i *s, + const __m256i *f) { + const __m256i res_0 = _mm256_madd_epi16(s[0], f[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], f[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], f[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], f[3]); + const __m256i res_4 = _mm256_madd_epi16(s[4], f[4]); + const __m256i res_5 = _mm256_madd_epi16(s[5], f[5]); + + const __m256i res = + _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(_mm256_add_epi32(res_2, res_3), + _mm256_add_epi32(res_4, res_5))); + return res; +} + +static INLINE void reuse_src_data_avx2(const __m256i *src, __m256i *des) { + des[0] = src[0]; + des[1] = src[1]; + des[2] = src[2]; + des[3] = src[3]; + des[4] = src[4]; +} + +void vpx_highbd_convolve12_horiz_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + assert(x_step_q4 == 16); + (void)y0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint16_t *src_ptr = src; + src_ptr -= MAX_FILTER_TAP / 2 - 1; + __m256i s[6], f[6]; + const __m256i rounding = _mm256_set1_epi32(1 << (FILTER_BITS - 1)); + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + highbd_shuffle_12tap_filter_avx2(filter[x0_q4], f); + + for (int j = 0; j < w; j += 8) { + for (int i = 0; i < h; i += 2) { + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + const __m256i row0 = + _mm256_loadu_si256((const __m256i *)&src_ptr[i * src_stride + j]); + // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 + // s115 + const __m256i row1 = _mm256_loadu_si256( + (const __m256i *)&src_ptr[(i + 1) * src_stride + j]); + // s016 s017 s018 s019 s020 s021 s022 s023 + const __m128i row0_16 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]); + // s116 s117 s118 s119 s120 s121 s122 s123 + const __m128i row1_16 = _mm_loadu_si128( + (const __m128i *)&src_ptr[(i + 1) * src_stride + j + 16]); + + // s00 s01 s02 s03 s04 s05 s06 s07 | s10 s11 s12 s13 s14 s15 s16 s17 + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + // s08 s09 s010 s011 s012 s013 s014 s015 | s18 s19 s110 s111 s112 s113 + // s114 s115 + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + // s016 s017 s018 s019 s020 s021 s022 s023 | s116 s117 s118 s119 s120 s121 + // s122 s123 + const __m256i r2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row0_16), row1_16, 1); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + s[4] = r1; + s[5] = _mm256_alignr_epi8(r2, r1, 4); + + // 00 02 04 06 | 10 12 14 16 + __m256i res_even = highbd_convolve_12tap(s, f); + res_even = + _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), FILTER_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + s[4] = _mm256_alignr_epi8(r2, r1, 2); + s[5] = _mm256_alignr_epi8(r2, r1, 6); + + // 01 03 05 07 | 11 13 15 17 + __m256i res_odd = highbd_convolve_12tap(s, f); + res_odd = + _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), FILTER_BITS); + + // 00 01 02 03 | 10 11 12 13 + const __m256i res_0 = _mm256_unpacklo_epi32(res_even, res_odd); + // 04 05 06 07 | 14 15 16 17 + const __m256i res_1 = _mm256_unpackhi_epi32(res_even, res_odd); + // 00 01 02 03 | 04 05 06 07 | 10 11 12 13 | 14 15 16 17 + const __m256i res_2 = _mm256_packus_epi32(res_0, res_1); + const __m256i res = _mm256_min_epi16(res_2, max); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + if (i + 1 < h) { + _mm_storeu_si128((__m128i *)(&dst[(i + 1) * dst_stride + j]), + _mm256_extractf128_si256(res, 1)); + } + } + } +} + +void vpx_highbd_convolve12_vert_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + assert(y_step_q4 == 16); + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint16_t *src_ptr = src; + src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1); + __m256i s[12], f[6]; + const __m256i rounding = _mm256_set1_epi32(((1 << FILTER_BITS) >> 1)); + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + highbd_shuffle_12tap_filter_avx2(filter[y0_q4], f); + + for (int j = 0; j < w; j += 8) { + __m128i s0 = + _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j)); + __m128i s1 = + _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j)); + __m128i s2 = + _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j)); + __m128i s3 = + _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j)); + __m128i s4 = + _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j)); + __m128i s5 = + _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j)); + __m128i s6 = + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j)); + __m128i s7 = + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j)); + __m128i s8 = + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j)); + __m128i s9 = + _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j)); + __m128i s10t = + _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j)); + + __m256i r01 = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + __m256i r12 = _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1); + __m256i r23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + __m256i r34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + __m256i r45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i r56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + __m256i r67 = _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1); + __m256i r78 = _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1); + __m256i r89 = _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1); + __m256i r910 = _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1); + + s[0] = _mm256_unpacklo_epi16(r01, r12); + s[1] = _mm256_unpacklo_epi16(r23, r34); + s[2] = _mm256_unpacklo_epi16(r45, r56); + s[3] = _mm256_unpacklo_epi16(r67, r78); + s[4] = _mm256_unpacklo_epi16(r89, r910); + + s[6] = _mm256_unpackhi_epi16(r01, r12); + s[7] = _mm256_unpackhi_epi16(r23, r34); + s[8] = _mm256_unpackhi_epi16(r45, r56); + s[9] = _mm256_unpackhi_epi16(r67, r78); + s[10] = _mm256_unpackhi_epi16(r89, r910); + for (int i = 0; i < h; i += 2) { + const __m128i s10 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 10) * src_stride + j)); + const __m128i s11 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 11) * src_stride + j)); + const __m128i s12 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 12) * src_stride + j)); + __m256i r1011 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1); + __m256i r1112 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1); + + s[5] = _mm256_unpacklo_epi16(r1011, r1112); + s[11] = _mm256_unpackhi_epi16(r1011, r1112); + + // 00 01 02 03 | 10 11 12 13 + const __m256i res_a = highbd_convolve_12tap(s, f); + __m256i res_a_round = + _mm256_srai_epi32(_mm256_add_epi32(res_a, rounding), FILTER_BITS); + // 04 05 06 07 | 14 15 16 17 + const __m256i res_b = highbd_convolve_12tap(s + 6, f); + __m256i res_b_round = + _mm256_srai_epi32(_mm256_add_epi32(res_b, rounding), FILTER_BITS); + + // 00 01 02 03 | 04 05 06 07 | 10 11 12 13 | 14 15 16 17 + const __m256i res_0 = _mm256_packus_epi32(res_a_round, res_b_round); + const __m256i res = _mm256_min_epi16(res_0, max); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + + _mm_storeu_si128((__m128i *)(&dst[(i + 1) * dst_stride + j]), + _mm256_extractf128_si256(res, 1)); + + reuse_src_data_avx2(s + 1, s); + reuse_src_data_avx2(s + 7, s + 6); + } + } +} + +void vpx_highbd_convolve12_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + DECLARE_ALIGNED(32, uint16_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]); + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + + vpx_highbd_convolve12_horiz_avx2(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve12_vert_avx2(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), + temp_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 0000000000..97f182c660 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu16_epi32(a_reg); + const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero); + const __m128i b_first = _mm_cvtepu16_epi32(b_reg); + const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi32(a_first, b_first); + dist_second = _mm_sub_epi32(a_second, b_second); + dist_first = _mm_mullo_epi32(dist_first, dist_first); + dist_second = _mm_mullo_epi32(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 4), dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)dist); + dist_left = _mm_loadu_si128((const __m128i *)(dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(dist + 1)); + + *sum = _mm_add_epi32(dist_reg, dist_left); + *sum = _mm_add_epi32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first, + __m128i *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(__m128i *output, const __m128i *sum, + const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void vp9_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. +static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_sse4_1( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, + blk_fw, use_whole_blk, y_accum, y_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); + + vp9_highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c new file mode 100644 index 0000000000..4540dca3a6 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_ssse3.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // SSSE3 + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +static INLINE void highbd_shuffle_12tap_filter_ssse3(const int16_t *filter, + __m128i *f) { + const __m128i f_low = _mm_loadu_si128((const __m128i *)filter); + const __m128i f_high = _mm_loadl_epi64((const __m128i *)(filter + 8)); + + f[0] = _mm_shuffle_epi32(f_low, 0x00); + f[1] = _mm_shuffle_epi32(f_low, 0x55); + f[2] = _mm_shuffle_epi32(f_low, 0xaa); + f[3] = _mm_shuffle_epi32(f_low, 0xff); + f[4] = _mm_shuffle_epi32(f_high, 0x00); + f[5] = _mm_shuffle_epi32(f_high, 0x55); +} + +static INLINE void unpacklo_src_ssse3(__m128i *a, __m128i *s) { + s[0] = _mm_unpacklo_epi16(a[0], a[1]); + s[1] = _mm_unpacklo_epi16(a[2], a[3]); + s[2] = _mm_unpacklo_epi16(a[4], a[5]); + s[3] = _mm_unpacklo_epi16(a[6], a[7]); + s[4] = _mm_unpacklo_epi16(a[8], a[9]); +} + +static INLINE void unpackhi_src_ssse3(__m128i *a, __m128i *s) { + s[0] = _mm_unpackhi_epi16(a[0], a[1]); + s[1] = _mm_unpackhi_epi16(a[2], a[3]); + s[2] = _mm_unpackhi_epi16(a[4], a[5]); + s[3] = _mm_unpackhi_epi16(a[6], a[7]); + s[4] = _mm_unpackhi_epi16(a[8], a[9]); +} + +static INLINE __m128i highbd_convolve_12tap(const __m128i *s, + const __m128i *f) { + const __m128i rounding = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + const __m128i res_0 = _mm_madd_epi16(s[0], f[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], f[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], f[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], f[3]); + const __m128i res_4 = _mm_madd_epi16(s[4], f[4]); + const __m128i res_5 = _mm_madd_epi16(s[5], f[5]); + + const __m128i res_6 = _mm_add_epi32( + _mm_add_epi32(res_0, res_1), + _mm_add_epi32(_mm_add_epi32(res_2, res_3), _mm_add_epi32(res_4, res_5))); + const __m128i res = + _mm_srai_epi32(_mm_add_epi32(res_6, rounding), FILTER_BITS); + return res; +} + +static INLINE void reuse_src_data_ssse3(const __m128i *src, __m128i *des) { + des[0] = src[0]; + des[1] = src[1]; + des[2] = src[2]; + des[3] = src[3]; + des[4] = src[4]; +} + +void vpx_highbd_convolve12_horiz_ssse3(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + assert(x_step_q4 == 16); + (void)y0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint16_t *src_ptr = src; + src_ptr -= MAX_FILTER_TAP / 2 - 1; + __m128i s[6], f[6]; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + const __m128i min = _mm_setzero_si128(); + highbd_shuffle_12tap_filter_ssse3(filter[x0_q4], f); + + for (int j = 0; j < w; j += 8) { + for (int i = 0; i < h; i++) { + // s00 s01 s02 s03 s04 s05 s06 s07 + const __m128i r0 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]); + // s08 s09 s010 s011 s012 s013 s014 s015 + const __m128i r1 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 8]); + // s016 s017 s018 s019 s020 s021 s022 s023 + const __m128i r2 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]); + + // even pixels + s[0] = r0; + s[1] = _mm_alignr_epi8(r1, r0, 4); + s[2] = _mm_alignr_epi8(r1, r0, 8); + s[3] = _mm_alignr_epi8(r1, r0, 12); + s[4] = r1; + s[5] = _mm_alignr_epi8(r2, r1, 4); + + // 00 02 04 06 + __m128i res_even = highbd_convolve_12tap(s, f); + + // odd pixels + s[0] = _mm_alignr_epi8(r1, r0, 2); + s[1] = _mm_alignr_epi8(r1, r0, 6); + s[2] = _mm_alignr_epi8(r1, r0, 10); + s[3] = _mm_alignr_epi8(r1, r0, 14); + s[4] = _mm_alignr_epi8(r2, r1, 2); + s[5] = _mm_alignr_epi8(r2, r1, 6); + + // 01 03 05 07 + __m128i res_odd = highbd_convolve_12tap(s, f); + + // 00 01 02 03 + const __m128i res_0 = _mm_unpacklo_epi32(res_even, res_odd); + // 04 05 06 07 + const __m128i res_1 = _mm_unpackhi_epi32(res_even, res_odd); + // 00 01 02 03 | 04 05 06 07 + const __m128i res_2 = _mm_packs_epi32(res_0, res_1); + const __m128i res = _mm_max_epi16(_mm_min_epi16(res_2, max), min); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } +} + +void vpx_highbd_convolve12_vert_ssse3(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + assert(y_step_q4 == 16); + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint16_t *src_ptr = src; + src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1); + __m128i s[12], r[12], a[11], f[6]; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + const __m128i min = _mm_setzero_si128(); + highbd_shuffle_12tap_filter_ssse3(filter[y0_q4], f); + + for (int j = 0; j < w; j += 8) { + a[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j)); + a[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j)); + a[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j)); + a[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j)); + a[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j)); + a[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j)); + a[6] = _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j)); + a[7] = _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j)); + a[8] = _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j)); + a[9] = _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j)); + a[10] = _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j)); + + // even row + unpacklo_src_ssse3(a, s); + unpackhi_src_ssse3(a, s + 6); + // odd row + unpacklo_src_ssse3(a + 1, r); + unpackhi_src_ssse3(a + 1, r + 6); + + for (int i = 0; i < h; i += 2) { + const __m128i s0 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 10) * src_stride + j)); + const __m128i s1 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 11) * src_stride + j)); + const __m128i s2 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 12) * src_stride + j)); + + s[5] = _mm_unpacklo_epi16(s0, s1); + r[5] = _mm_unpacklo_epi16(s1, s2); + + s[11] = _mm_unpackhi_epi16(s0, s1); + r[11] = _mm_unpackhi_epi16(s1, s2); + + // 00 01 02 03 + const __m128i res_a = highbd_convolve_12tap(s, f); + // 04 05 06 07 + const __m128i res_b = highbd_convolve_12tap(s + 6, f); + // 10 11 12 13 + const __m128i res_c = highbd_convolve_12tap(r, f); + // 14 15 16 17 + const __m128i res_d = highbd_convolve_12tap(r + 6, f); + + // 00 01 02 03 | 04 05 06 07 + const __m128i res_0 = _mm_packs_epi32(res_a, res_b); + // 10 11 12 13 | 14 15 16 17 + const __m128i res_1 = _mm_packs_epi32(res_c, res_d); + const __m128i res_r0 = _mm_max_epi16(_mm_min_epi16(res_0, max), min); + const __m128i res_r1 = _mm_max_epi16(_mm_min_epi16(res_1, max), min); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_r0); + _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j], res_r1); + + reuse_src_data_ssse3(s + 1, s); + reuse_src_data_ssse3(s + 7, s + 6); + reuse_src_data_ssse3(r + 1, r); + reuse_src_data_ssse3(r + 7, r + 6); + } + } +} + +void vpx_highbd_convolve12_ssse3(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + DECLARE_ALIGNED(32, uint16_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]); + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + + vpx_highbd_convolve12_horiz_ssse3(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve12_vert_ssse3( + temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), temp_stride, dst, + dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c new file mode 100644 index 0000000000..9d94ff8d69 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_avx2.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src_mask1_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask2_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask3_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_src_mask4_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +static INLINE void shuffle_12tap_filter_avx2(const int16_t *filter, + __m256i *f) { + const __m256i f_low = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)filter)); + const __m256i f_high = _mm256_broadcastsi128_si256( + _mm_loadl_epi64((const __m128i *)(filter + 8))); + + f[0] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_low, _mm256_set1_epi16(0x0e0cu)); + f[4] = _mm256_shuffle_epi8(f_high, _mm256_set1_epi16(0x0200u)); + f[5] = _mm256_shuffle_epi8(f_high, _mm256_set1_epi16(0x0604u)); +} + +static INLINE void shuffle_src_data_avx2(const __m256i *r1, const __m256i *r2, + const __m256i *f, __m256i *s) { + s[0] = _mm256_shuffle_epi8(*r1, f[0]); + s[1] = _mm256_shuffle_epi8(*r1, f[1]); + s[2] = _mm256_shuffle_epi8(*r1, f[2]); + s[3] = _mm256_shuffle_epi8(*r1, f[3]); + s[4] = _mm256_shuffle_epi8(*r2, f[0]); + s[5] = _mm256_shuffle_epi8(*r2, f[1]); +} + +static INLINE void reuse_src_data_avx2(const __m256i *src, __m256i *des) { + des[0] = src[0]; + des[1] = src[1]; + des[2] = src[2]; + des[3] = src[3]; + des[4] = src[4]; +} + +static INLINE __m256i convolve12_16_avx2(const __m256i *s, const __m256i *f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << (FILTER_BITS - 1)); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + const __m256i x4 = _mm256_maddubs_epi16(s[4], f[4]); + const __m256i x5 = _mm256_maddubs_epi16(s[5], f[5]); + __m256i sum1, sum2, sum3; + + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x3, x5); + sum3 = _mm256_add_epi16(x1, x4); + sum3 = _mm256_add_epi16(sum3, k_64); + + const __m256i s0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum1)); + const __m256i s1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum1, 1)); + const __m256i s2 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum2)); + const __m256i s3 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum2, 1)); + const __m256i s4 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum3)); + const __m256i s5 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(sum3, 1)); + + sum1 = _mm256_add_epi32(s0, s2); + sum2 = _mm256_add_epi32(s1, s3); + sum1 = _mm256_add_epi32(sum1, s4); + sum2 = _mm256_add_epi32(sum2, s5); + + // round and shift by 7 bit each 32 bit + // 0 1 2 3 4 5 6 7 + sum1 = _mm256_srai_epi32(sum1, FILTER_BITS); + // 8 9 10 11 12 13 14 15 + sum2 = _mm256_srai_epi32(sum2, FILTER_BITS); + + // 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + __m256i const res = + _mm256_permute4x64_epi64(_mm256_packus_epi32(sum1, sum2), 0xD8); + return res; +} + +void vpx_convolve12_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(w == 32 || w == 16 || w == 8); + (void)y0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint8_t *src_ptr = src; + src_ptr -= MAX_FILTER_TAP / 2 - 1; + __m256i s[6], f[6], src_mask[4]; + + shuffle_12tap_filter_avx2(filter[x0_q4], f); + src_mask[0] = _mm256_load_si256((__m256i const *)shuffle_src_mask1_avx2); + src_mask[1] = _mm256_load_si256((__m256i const *)shuffle_src_mask2_avx2); + src_mask[2] = _mm256_load_si256((__m256i const *)shuffle_src_mask3_avx2); + src_mask[3] = _mm256_load_si256((__m256i const *)shuffle_src_mask4_avx2); + if (w == 8) { + for (int i = 0; i < h; i += 4) { + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + const __m128i row0 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride]); + // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021 + // s022 s023 + const __m128i row0_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + 8]); + // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115 + const __m128i row1 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride]); + const __m128i row1_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride + 8]); + // s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s210 s211 s212 s213 s214 s215 + const __m128i row2 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 2) * src_stride]); + const __m128i row2_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 2) * src_stride + 8]); + // s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 s310 s311 s312 s313 s314 s115 + const __m128i row3 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 3) * src_stride]); + const __m128i row3_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 3) * src_stride + 8]); + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 | + // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115 + const __m256i row01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1); + // s20 s21 s22 s23 s24 s25 s26 s27 s28 s29 s210 s211 s212 s213 s214 s215 | + // s30 s31 s32 s33 s34 s35 s36 s37 s38 s39 s310 s311 s312 s313 s314 s115 + const __m256i row23 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row2), row3, 1); + // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021 + // s022 s023 | s18 s19 s110 s111 s112 s113 s114 s115 s116 s117 s118 s119 + // s120 s121 s122 s123 + const __m256i row01_8 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row0_8), row1_8, 1); + const __m256i row23_8 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row2_8), row3_8, 1); + + shuffle_src_data_avx2(&row01, &row01_8, src_mask, s); + const __m256i res_0 = convolve12_16_avx2(s, f); + + shuffle_src_data_avx2(&row23, &row23_8, src_mask, s); + const __m256i res_1 = convolve12_16_avx2(s, f); + + // 00 01 02 03 04 05 06 07 | 10 11 12 13 14 15 16 17 | 08 09 010 011 012 + // 013 014 015 | 18 19 110 111 112 113 114 115 + const __m256i res = _mm256_packus_epi16(res_0, res_1); + const __m128i res_lo = _mm256_castsi256_si128(res); + const __m128i res_hi = _mm256_extracti128_si256(res, 1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_lo); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride], res_hi); + _mm_storel_epi64((__m128i *)&dst[(i + 2) * dst_stride], + _mm_srli_si128(res_lo, 8)); + _mm_storel_epi64((__m128i *)&dst[(i + 3) * dst_stride], + _mm_srli_si128(res_hi, 8)); + } + } else { + for (int j = 0; j < w; j += 16) { + for (int i = 0; i < h; i += 2) { + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + const __m128i row0 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]); + // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029 + // s030 s031 + const __m128i row0_16 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]); + // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 + // s115 + const __m128i row1 = _mm_loadu_si128( + (const __m128i *)&src_ptr[(i + 1) * src_stride + j]); + // s116 s117 s118 s119 s120 s121 s122 s123 s124 s125 s126 s127 s128 + // s129 s130 s131 + const __m128i row1_16 = _mm_loadu_si128( + (const __m128i *)&src_ptr[(i + 1) * src_stride + j + 16]); + + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + // | s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 + // s115 + const __m256i r0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1); + // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029 + // s030 s031 | s116 s117 s118 s119 s120 s121 s122 s123 s124 s125 s126 + // s127 s128 s129 s130 s131 + const __m256i r2 = _mm256_inserti128_si256( + _mm256_castsi128_si256(row0_16), row1_16, 1); + + // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021 + // s022 s023 | s18 s19 s110 s111 s112 s113 s114 s115 s116 s117 s118 s119 + // s120 s121 s122 s123 + const __m256i r1 = _mm256_alignr_epi8(r2, r0, 8); + + shuffle_src_data_avx2(&r0, &r1, src_mask, s); + const __m256i res_0 = convolve12_16_avx2(s, f); + + shuffle_src_data_avx2(&r1, &r2, src_mask, s); + const __m256i res_1 = convolve12_16_avx2(s, f); + + const __m256i res = _mm256_packus_epi16(res_0, res_1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + if (i + 1 < h) { + _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j], + _mm256_extracti128_si256(res, 1)); + } + } + } + } +} + +void vpx_convolve12_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint8_t *src_ptr = src; + src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1); + __m256i s[12], f[6]; + + shuffle_12tap_filter_avx2(filter[y0_q4], f); + if (w == 8) { + const __m128i s0 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_stride)); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_stride)); + const __m128i s2 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_stride)); + const __m128i s3 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_stride)); + const __m128i s4 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_stride)); + const __m128i s5 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_stride)); + const __m128i s6 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_stride)); + const __m128i s7 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_stride)); + const __m128i s8 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_stride)); + const __m128i s9 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_stride)); + const __m128i s10t = + _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_stride)); + + const __m256i r01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r12 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1); + const __m256i r23 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + const __m256i r34 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + const __m256i r45 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + const __m256i r56 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + const __m256i r67 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1); + const __m256i r78 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1); + const __m256i r89 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1); + const __m256i r910 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1); + + s[0] = _mm256_unpacklo_epi8(r01, r12); + s[1] = _mm256_unpacklo_epi8(r23, r34); + s[2] = _mm256_unpacklo_epi8(r45, r56); + s[3] = _mm256_unpacklo_epi8(r67, r78); + s[4] = _mm256_unpacklo_epi8(r89, r910); + for (int i = 0; i < h; i += 2) { + const __m128i s10 = + _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 10) * src_stride)); + const __m128i s11 = + _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 11) * src_stride)); + const __m128i s12 = + _mm_loadl_epi64((const __m128i *)(src_ptr + (i + 12) * src_stride)); + + const __m256i r1011 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1); + const __m256i r1112 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1); + s[5] = _mm256_unpacklo_epi8(r1011, r1112); + const __m256i res_0 = convolve12_16_avx2(s, f); + + __m256i res = _mm256_packus_epi16(res_0, res_0); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride], + _mm256_extracti128_si256(res, 1)); + + reuse_src_data_avx2(s + 1, s); + } + } else { + for (int j = 0; j < w; j += 16) { + const __m128i s0 = + _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride + j)); + const __m128i s1 = + _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride + j)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride + j)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_stride + j)); + const __m128i s4 = + _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_stride + j)); + const __m128i s5 = + _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_stride + j)); + const __m128i s6 = + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_stride + j)); + const __m128i s7 = + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_stride + j)); + const __m128i s8 = + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_stride + j)); + const __m128i s9 = + _mm_loadu_si128((const __m128i *)(src_ptr + 9 * src_stride + j)); + const __m128i s10t = + _mm_loadu_si128((const __m128i *)(src_ptr + 10 * src_stride + j)); + + const __m256i r01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r12 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s1), s2, 1); + const __m256i r23 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + const __m256i r34 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + const __m256i r45 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + const __m256i r56 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + const __m256i r67 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s6), s7, 1); + const __m256i r78 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s7), s8, 1); + const __m256i r89 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s8), s9, 1); + const __m256i r910 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s9), s10t, 1); + + s[0] = _mm256_unpacklo_epi8(r01, r12); + s[1] = _mm256_unpacklo_epi8(r23, r34); + s[2] = _mm256_unpacklo_epi8(r45, r56); + s[3] = _mm256_unpacklo_epi8(r67, r78); + s[4] = _mm256_unpacklo_epi8(r89, r910); + + s[6] = _mm256_unpackhi_epi8(r01, r12); + s[7] = _mm256_unpackhi_epi8(r23, r34); + s[8] = _mm256_unpackhi_epi8(r45, r56); + s[9] = _mm256_unpackhi_epi8(r67, r78); + s[10] = _mm256_unpackhi_epi8(r89, r910); + for (int i = 0; i < h; i += 2) { + const __m128i s10 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 10) * src_stride + j)); + const __m128i s11 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 11) * src_stride + j)); + const __m128i s12 = _mm_loadu_si128( + (const __m128i *)(src_ptr + (i + 12) * src_stride + j)); + + const __m256i r1011 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s10), s11, 1); + const __m256i r1112 = + _mm256_inserti128_si256(_mm256_castsi128_si256(s11), s12, 1); + + s[5] = _mm256_unpacklo_epi8(r1011, r1112); + s[11] = _mm256_unpackhi_epi8(r1011, r1112); + + const __m256i res_0 = convolve12_16_avx2(s, f); + const __m256i res_1 = convolve12_16_avx2(s + 6, f); + + __m256i res = _mm256_packus_epi16(res_0, res_1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[(i + 1) * dst_stride + j], + _mm256_extracti128_si256(res, 1)); + + reuse_src_data_avx2(s + 1, s); + reuse_src_data_avx2(s + 7, s + 6); + } + } + } +} + +void vpx_convolve12_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel12 *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + DECLARE_ALIGNED(32, uint8_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]); + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + vpx_convolve12_horiz_avx2(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + vpx_convolve12_vert_avx2(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), + temp_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000000..7571bfccac --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const __m128i *weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = *weight; + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + __m128i weight_first, weight_second; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); + } else { + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void vp9_apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The blockwidth is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + __m128i weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + + // First row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = _mm_load_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); + } else { + weight = _mm_set1_epi16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c new file mode 100644 index 0000000000..abf0ae1381 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_ssse3.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // SSSE3 + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "vp9/encoder/vp9_temporal_filter.h" + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_src_mask1_ssse3[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_src_mask2_ssse3[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_src_mask3_ssse3[32]) = { 4, 5, 5, 6, 6, 7, 7, 8, + 8, 9, 9, 10, 10, 11, 11, 12 }; + +DECLARE_ALIGNED(16, static const uint8_t, shuffle_src_mask4_ssse3[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +static INLINE void sign_extend_16bit_to_32bit_ssse3(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmpgt_epi16(zero, in); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + +static INLINE void shuffle_12tap_filter_ssse3(const int16_t *filter, + __m128i *f) { + const __m128i f_low = _mm_loadu_si128((const __m128i *)filter); + const __m128i f_high = _mm_loadl_epi64((const __m128i *)(filter + 8)); + + f[0] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_low, _mm_set1_epi16(0x0e0cu)); + f[4] = _mm_shuffle_epi8(f_high, _mm_set1_epi16(0x0200u)); + f[5] = _mm_shuffle_epi8(f_high, _mm_set1_epi16(0x0604u)); +} + +static INLINE void shuffle_src_data_ssse3(const __m128i *r1, const __m128i *r2, + const __m128i *f, __m128i *s) { + s[0] = _mm_shuffle_epi8(*r1, f[0]); + s[1] = _mm_shuffle_epi8(*r1, f[1]); + s[2] = _mm_shuffle_epi8(*r1, f[2]); + s[3] = _mm_shuffle_epi8(*r1, f[3]); + s[4] = _mm_shuffle_epi8(*r2, f[0]); + s[5] = _mm_shuffle_epi8(*r2, f[1]); +} + +static INLINE void reuse_src_data_ssse3(const __m128i *src, __m128i *des) { + des[0] = src[0]; + des[1] = src[1]; + des[2] = src[2]; + des[3] = src[3]; + des[4] = src[4]; +} + +static INLINE __m128i convolve12_16_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << (FILTER_BITS - 1)); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + const __m128i x5 = _mm_maddubs_epi16(s[5], f[5]); + __m128i sum1, sum2, sum3, s0, s1, s2, s3, s4, s5; + + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x3, x5); + sum3 = _mm_add_epi16(x1, x4); + sum3 = _mm_add_epi16(sum3, k_64); + + sign_extend_16bit_to_32bit_ssse3(sum1, _mm_setzero_si128(), &s0, &s1); + sign_extend_16bit_to_32bit_ssse3(sum2, _mm_setzero_si128(), &s2, &s3); + sign_extend_16bit_to_32bit_ssse3(sum3, _mm_setzero_si128(), &s4, &s5); + sum1 = _mm_add_epi32(s0, s2); + sum2 = _mm_add_epi32(s1, s3); + sum1 = _mm_add_epi32(sum1, s4); + sum2 = _mm_add_epi32(sum2, s5); + + // round and shift by 7 bit each 32 bit + // 0 1 2 3 + sum1 = _mm_srai_epi32(sum1, FILTER_BITS); + // 4 5 6 7 + sum2 = _mm_srai_epi32(sum2, FILTER_BITS); + + // 0 1 2 3 4 5 6 7 + __m128i const res = _mm_packs_epi32(sum1, sum2); + return res; +} + +void vpx_convolve12_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(w == 32 || w == 16 || w == 8); + (void)y0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint8_t *src_ptr = src; + src_ptr -= MAX_FILTER_TAP / 2 - 1; + __m128i s[6], f[6], src_mask[4]; + + shuffle_12tap_filter_ssse3(filter[x0_q4], f); + src_mask[0] = _mm_load_si128((__m128i const *)shuffle_src_mask1_ssse3); + src_mask[1] = _mm_load_si128((__m128i const *)shuffle_src_mask2_ssse3); + src_mask[2] = _mm_load_si128((__m128i const *)shuffle_src_mask3_ssse3); + src_mask[3] = _mm_load_si128((__m128i const *)shuffle_src_mask4_ssse3); + if (w == 8) { + for (int i = 0; i < h; i += 2) { + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + const __m128i row0 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride]); + // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021 + // s022 s023 + const __m128i row0_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + 8]); + // s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s110 s111 s112 s113 s114 s115 + const __m128i row1 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride]); + const __m128i row1_8 = + _mm_loadu_si128((const __m128i *)&src_ptr[(i + 1) * src_stride + 8]); + + shuffle_src_data_ssse3(&row0, &row0_8, src_mask, s); + const __m128i res_0 = convolve12_16_ssse3(s, f); + + shuffle_src_data_ssse3(&row1, &row1_8, src_mask, s); + const __m128i res_1 = convolve12_16_ssse3(s, f); + + const __m128i res = _mm_packus_epi16(res_0, res_1); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride], + _mm_srli_si128(res, 8)); + } + } else { + for (int j = 0; j < w; j += 16) { + for (int i = 0; i < h; i++) { + // s00 s01 s02 s03 s04 s05 s06 s07 s08 s09 s010 s011 s012 s013 s014 s015 + const __m128i r0 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j]); + // s016 s017 s018 s019 s020 s021 s022 s023 s024 s025 s026 s027 s028 s029 + // s030 s031 + const __m128i r2 = + _mm_loadu_si128((const __m128i *)&src_ptr[i * src_stride + j + 16]); + + // s08 s09 s010 s011 s012 s013 s014 s015 s016 s017 s018 s019 s020 s021 + // s022 s023 + const __m128i r1 = _mm_alignr_epi8(r2, r0, 8); + + shuffle_src_data_ssse3(&r0, &r1, src_mask, s); + const __m128i res_0 = convolve12_16_ssse3(s, f); + + shuffle_src_data_ssse3(&r1, &r2, src_mask, s); + const __m128i res_1 = convolve12_16_ssse3(s, f); + + const __m128i res = _mm_packus_epi16(res_0, res_1); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } +} + +void vpx_convolve12_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + const uint8_t *src_ptr = src; + src_ptr -= src_stride * (MAX_FILTER_TAP / 2 - 1); + __m128i s[12], f[6]; + + shuffle_12tap_filter_ssse3(filter[y0_q4], f); + for (int j = 0; j < w; j += 8) { + const __m128i s0 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_stride + j)); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_stride + j)); + const __m128i s2 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_stride + j)); + const __m128i s3 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_stride + j)); + const __m128i s4 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_stride + j)); + const __m128i s5 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_stride + j)); + const __m128i s6 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_stride + j)); + const __m128i s7 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_stride + j)); + const __m128i s8 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_stride + j)); + const __m128i s9 = + _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_stride + j)); + const __m128i s10t = + _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_stride + j)); + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + s[0] = _mm_unpacklo_epi8(s0, s1); + s[1] = _mm_unpacklo_epi8(s2, s3); + s[2] = _mm_unpacklo_epi8(s4, s5); + s[3] = _mm_unpacklo_epi8(s6, s7); + s[4] = _mm_unpacklo_epi8(s8, s9); + + s[6] = _mm_unpacklo_epi8(s1, s2); + s[7] = _mm_unpacklo_epi8(s3, s4); + s[8] = _mm_unpacklo_epi8(s5, s6); + s[9] = _mm_unpacklo_epi8(s7, s8); + s[10] = _mm_unpacklo_epi8(s9, s10t); + for (int i = 0; i < h; i += 2) { + const __m128i s10 = _mm_loadl_epi64( + (const __m128i *)(src_ptr + (i + 10) * src_stride + j)); + const __m128i s11 = _mm_loadl_epi64( + (const __m128i *)(src_ptr + (i + 11) * src_stride + j)); + const __m128i s12 = _mm_loadl_epi64( + (const __m128i *)(src_ptr + (i + 12) * src_stride + j)); + + s[5] = _mm_unpacklo_epi8(s10, s11); + s[11] = _mm_unpacklo_epi8(s11, s12); + + const __m128i res_0 = convolve12_16_ssse3(s, f); + const __m128i res_1 = convolve12_16_ssse3(s + 6, f); + + __m128i res = _mm_packus_epi16(res_0, res_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + _mm_storel_epi64((__m128i *)&dst[(i + 1) * dst_stride + j], + _mm_srli_si128(res, 8)); + + reuse_src_data_ssse3(s + 1, s); + reuse_src_data_ssse3(s + 7, s + 6); + } + } +} + +void vpx_convolve12_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel12 *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(h == 32 || h == 16 || h == 8); + assert(w == 32 || w == 16 || w == 8); + DECLARE_ALIGNED(32, uint8_t, temp[BW * (BH + MAX_FILTER_TAP - 1)]); + const int temp_stride = BW; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + MAX_FILTER_TAP; + vpx_convolve12_horiz_ssse3(src - src_stride * (MAX_FILTER_TAP / 2 - 1), + src_stride, temp, temp_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); + vpx_convolve12_vert_ssse3(temp + temp_stride * (MAX_FILTER_TAP / 2 - 1), + temp_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 0712779b75..e9943447fd 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -14,7 +14,9 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -71,7 +73,7 @@ static INLINE void transpose_4x4(__m128i *res) { } static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -109,7 +111,7 @@ static void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); @@ -169,454 +171,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; - } -} - -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - __m128i zero; - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; } } @@ -708,61 +269,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -897,7 +406,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -914,8 +423,8 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_setzero_si128(); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; @@ -1127,7 +636,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1150,14 +659,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1184,23 +693,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1212,7 +704,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); @@ -1559,12 +1051,12 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); @@ -2004,13 +1496,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -2033,13 +1525,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm index ced37bd16e..8152dce864 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -62,25 +63,7 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m0, 2 psllw m1, 2 -%if CONFIG_VP9_HIGHBITDEPTH - ; sign extension - mova m2, m0 - mova m3, m1 - punpcklwd m0, m0 - punpcklwd m1, m1 - punpckhwd m2, m2 - punpckhwd m3, m3 - psrad m0, 16 - psrad m1, 16 - psrad m2, 16 - psrad m3, 16 - mova [outputq], m0 - mova [outputq + 16], m2 - mova [outputq + 32], m1 - mova [outputq + 48], m3 -%else - mova [outputq], m0 - mova [outputq + 16], m1 -%endif + STORE_TRAN_LOW 0, outputq, 0, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c deleted file mode 100644 index b3c3d7beb9..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include // SSSE3 - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/x86/fdct.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_sub_epi16(q6, q5); - const __m128i d1 = _mm_add_epi16(q6, q5); - const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); - const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); - - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant, thr; - int16_t nzflag; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - thr = _mm_srai_epi16(dequant, 1); - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - // Maybe a more efficient way to store 0? - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; - } -} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c index 91d0602f9d..5930bf491e 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/emmintrin_compat.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 2f3c66c083..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#if defined(_MSC_VER) -#include -#endif -#include -#include - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... - const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - unsigned int best_sad; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; -#if ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. - { -#if ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = - _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = - _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); - const __m128i v_joint_cost_d = - _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = _mm_add_epi32(v_cost_d, - _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. - if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32(bmv.as_int); -#if ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c new file mode 100644 index 0000000000..99fef31d16 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" + +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_256, ssz_256; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_hi, ssz_hi; + __m128i sse_128, ssz_128; + int64_t sse; + const __m256i zero = _mm256_setzero_si256(); + + // If the block size is 16 then the results will fit in 32 bits. + if (block_size == 16) { + __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi; + // Load 16 elements for coeff and dqcoeff. + coeff_256 = load_tran_low(coeff); + dqcoeff_256 = load_tran_low(dqcoeff); + // dqcoeff - coeff + dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256); + // madd (dqcoeff - coeff) + dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256); + // madd coeff + coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256); + // Save the higher 64 bit of each 128 bit lane. + dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8); + coeff_hi = _mm256_srli_si256(coeff_256, 8); + // Add the higher 64 bit to the low 64 bit. + dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi); + coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero); + ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero); + } else { + int i; + assert(block_size % 32 == 0); + sse_256 = zero; + ssz_256 = zero; + + for (i = 0; i < block_size; i += 32) { + __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1; + // Load 32 elements for coeff and dqcoeff. + coeff_0 = load_tran_low(coeff + i); + dqcoeff_0 = load_tran_low(dqcoeff + i); + coeff_1 = load_tran_low(coeff + i + 16); + dqcoeff_1 = load_tran_low(dqcoeff + i + 16); + // dqcoeff - coeff + dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0); + dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1); + // madd (dqcoeff - coeff) + dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0); + dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1); + // madd coeff + coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0); + coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1); + // Add the first madd (dqcoeff - coeff) with the second. + dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1); + // Add the first madd (coeff) with the second. + coeff_0 = _mm256_add_epi32(coeff_0, coeff_1); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero); + // Add each quad word of madd (dqcoeff - coeff) and madd (coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi); + ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi); + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + ssz_hi = _mm256_srli_si256(ssz_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256), + _mm256_extractf128_si256(ssz_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)(&sse), sse_128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_128); + return sse; +} + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c deleted file mode 100644 index 453af2a403..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Usee of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vp9_rtcd.h" -#include "vpx/vpx_integer.h" - -int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, - intptr_t block_size, int64_t *ssz) { - __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; - __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; - __m256i sse_reg_64hi, ssz_reg_64hi; - __m128i sse_reg128, ssz_reg128; - int64_t sse; - int i; - const __m256i zero_reg = _mm256_set1_epi16(0); - - // init sse and ssz registerd to zero - sse_reg = _mm256_set1_epi16(0); - ssz_reg = _mm256_set1_epi16(0); - - for (i = 0; i < block_size; i += 16) { - // load 32 bytes from coeff and dqcoeff - coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); - dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); - // dqcoeff - coeff - dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); - // madd (dqcoeff - coeff) - dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); - // madd coeff - coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); - // expand each double word of madd (dqcoeff - coeff) to quad word - exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); - exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); - // expand each double word of madd (coeff) to quad word - exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); - exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); - // add each quad word of madd (dqcoeff - coeff) and madd (coeff) - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); - sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); - ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); - } - // save the higher 64 bit of each 128 bit lane - sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); - ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); - // add the higher 64 bit to the low 64 bit - sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); - ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); - - // add each 64 bit from each of the 128 bit lane of the 256 bit - sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), - _mm256_extractf128_si256(sse_reg, 1)); - - ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), - _mm256_extractf128_si256(ssz_reg, 1)); - - // store the results - _mm_storel_epi64((__m128i *)(&sse), sse_reg128); - - _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); - return sse; -} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm index 5b0238272b..7beec130ab 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -22,14 +23,14 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pxor m4, m4 ; sse accumulator pxor m6, m6 ; ssz accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and @@ -38,32 +39,26 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 ; accumulate in 64bit punpckldq m7, m0, m5 punpckhdq m0, m5 paddq m4, m7 - punpckldq m7, m1, m5 - paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m7 punpckldq m7, m2, m5 - paddq m4, m1 + paddq m4, m0 punpckhdq m2, m5 paddq m6, m7 - punpckldq m7, m3, m5 paddq m6, m2 - punpckhdq m3, m5 - paddq m6, m7 - paddq m6, m3 - add sizeq, mmsize - jl .loop + jg .loop ; accumulate horizontally and store in return value movhlps m5, m4 movhlps m7, m6 paddq m4, m5 paddq m6, m7 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 movq rax, m4 movq [sszq], m6 %else @@ -75,44 +70,42 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz %endif RET -; Compute the sum of squared difference between two int16_t vectors. -; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff, +; Compute the sum of squared difference between two tran_low_t vectors. +; Vectors are converted (if necessary) to int16_t for calculations. +; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff, ; intptr_t block_size) INIT_XMM sse2 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size pxor m4, m4 ; sse accumulator pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*2] - lea dqcq, [dqcq+sizeq*2] - neg sizeq .loop: - mova m2, [uqcq+sizeq*2] - mova m0, [dqcq+sizeq*2] - mova m3, [uqcq+sizeq*2+mmsize] - mova m1, [dqcq+sizeq*2+mmsize] + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 psubw m0, m2 psubw m1, m3 ; individual errors are max. 15bit+sign, so squares are 30bit, and ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) pmaddwd m0, m0 pmaddwd m1, m1 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 ; accumulate in 64bit punpckldq m3, m0, m5 punpckhdq m0, m5 paddq m4, m3 - punpckldq m3, m1, m5 paddq m4, m0 - punpckhdq m1, m5 - paddq m4, m3 - paddq m4, m1 - add sizeq, mmsize - jl .loop + jnz .loop ; accumulate horizontally and store in return value movhlps m5, m4 paddq m4, m5 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 movq rax, m4 %else pshufd m5, m4, 0x1 diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index fa2a6449b0..628dc4fead 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -13,192 +13,895 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_scale/yv12config.h" -extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} -static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, - int h) { +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); - const int max_width = w & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); +} + +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. + + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idxx indicates the pixel offset is even (0) or odd (1). + // It's used to choose the src offset and filter coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs kConvolve8Funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Sub 64 to avoid overflow. + // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. + // Coef 128 is in either fx[1] or fx[2] depending on the phase idx. + // When filter phase idx is 1, the two biggest coefficients are shuffled + // together, and the sum of them are always no less than 128. Sub 64 here. + // After the subtraction, when the sum of all positive coefficients are no + // larger than 128, and the sum of all negative coefficients are no + // less than -128, there will be no overflow in the convolve8 functions. + f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // OC 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. +static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; int y; - for (y = 0; y < h; ++y) { + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { int x; - for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); - const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); - const __m128i a_and = _mm_and_si128(a, mask); - const __m128i b_and = _mm_and_si128(b, mask); - const __m128i c = _mm_packus_epi16(a_and, b_and); - _mm_storeu_si128((__m128i *)(dst + x), c); + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } - for (; x < w; ++x) dst[x] = src[x * 2]; - src += src_stride * 2; - dst += dst_stride; - } -} -static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, - const __m128i *const c, const __m128i *const d, - const __m128i *const e, const __m128i *const f, - const __m128i *const g, const __m128i *const h) { - const __m128i coeffs_ab = - _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, - -19, 78, -19, 78, -19, 78, -19); - const __m128i const64_x16 = _mm_set1_epi16(64); - const __m128i ab = _mm_unpacklo_epi8(*a, *b); - const __m128i cd = _mm_unpacklo_epi8(*c, *d); - const __m128i fe = _mm_unpacklo_epi8(*f, *e); - const __m128i hg = _mm_unpacklo_epi8(*h, *g); - const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab); - const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd); - const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd); - const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab); - // can not overflow - const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms); - // can not overflow - const __m128i fehg_terms = _mm_add_epi16(fe_terms, hg_terms); - // can overflow, use saturating add - const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms); - const __m128i round = _mm_adds_epi16(terms, const64_x16); - const __m128i shift = _mm_srai_epi16(round, 7); - return _mm_packus_epi16(shift, shift); -} - -static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { - const int max_width = w & ~7; - int x = 0; - for (; x < max_width; x += 8) { - const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7)); - const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h); - _mm_storel_epi64((__m128i *)(dst + x), pack); - } -} - -static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int dst_w, - int dst_h) { - dst_w /= 2; - dst_h /= 2; - { - DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]); - uint8_t *tmp0 = tmp + dst_w * 0; - uint8_t *tmp1 = tmp + dst_w * 1; - uint8_t *tmp2 = tmp + dst_w * 2; - uint8_t *tmp3 = tmp + dst_w * 3; - uint8_t *tmp4 = tmp + dst_w * 4; - uint8_t *tmp5 = tmp + dst_w * 5; - uint8_t *tmp6 = tmp + dst_w * 6; - uint8_t *tmp7 = tmp + dst_w * 7; - uint8_t *tmp8 = NULL; - const int max_width = dst_w & ~7; - int y; - eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w); - eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w); - eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w); - eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w); - eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w); - eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w); - eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w); - for (y = 0; y < dst_h; y++) { - int x; - eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); - for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); - const __m128i B = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i AB = _mm_unpacklo_epi8(A, B); - __m128i C, D, CD; - _mm_storeu_si128((__m128i *)(dst + x * 2), AB); - { - const __m128i a = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3)); - const __m128i b = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2)); - const __m128i c = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1)); - const __m128i d = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0)); - const __m128i e = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1)); - const __m128i f = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2)); - const __m128i g = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3)); - const __m128i h = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4)); - C = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - { - const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x)); - D = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - CD = _mm_unpacklo_epi8(C, D); - _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD); - } - src += src_stride; - dst += dst_stride * 2; - tmp8 = tmp0; - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = tmp6; - tmp6 = tmp7; - tmp7 = tmp8; - } - } + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { + YV12_BUFFER_CONFIG *dst, + uint8_t filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const int dst_uv_w = dst_w / 2; - const int dst_uv_h = dst_h / 2; + const int dst_uv_w = dst->uv_crop_width; + const int dst_uv_h = dst->uv_crop_height; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); if (dst_w * 2 == src_w && dst_h * 2 == src_h) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2) { - // The upsample() supports widths up to 1920 * 2. If greater, fall back - // to vp9_scale_and_extend_frame_c(). - if (dst_w / 2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); } else { - vp9_scale_and_extend_frame_c(src, dst); + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + const int src_uv_w = src->uv_crop_width; + const int src_uv_h = src->uv_crop_height; + scale_plane_1_to_2_phase_0( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src_uv_w, src_uv_h, vp9_filter_kernels[filter_type][8], temp_buffer); + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); } else { - vp9_scale_and_extend_frame_c(src, dst); + // Call c version for all other scaling ratios. + vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c index 91f627c343..d7aafe7b01 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -11,27 +11,28 @@ #include #include +#include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { int i, j, test; uint32_t temp[4]; __m128i max, min, cmp0, cmp1, cmp2, cmp3; int64_t error = 0, sqcoeff = 0; - const int shift = 2 * (bps - 8); + const int shift = 2 * (bd - 8); const int rounding = shift > 0 ? 1 << (shift - 1) : 0; for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers - __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); - __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); - __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); - __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bit max = _mm_set1_epi32(0x3fff); - min = _mm_set1_epi32(0xffffc000); + min = _mm_set1_epi32((int32_t)0xffffc000); cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), _mm_cmplt_epi32(mm_coeff, min)); cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm deleted file mode 100644 index e476323e14..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_avx.asm +++ /dev/null @@ -1,261 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM avx -cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz - vzeroupper - - ; If only one iteration is required, then handle this as a special case. - ; It is the most frequent case, so we can have a significant gain here - ; by not setting up a loop and accumulators. - cmp sizeq, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Common case of size == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - ; Load input vectors - mova xm0, [dqcq] - packssdw xm0, [dqcq+16] - mova xm2, [uqcq] - packssdw xm2, [uqcq+16] - - mova xm1, [dqcq+32] - packssdw xm1, [dqcq+48] - mova xm3, [uqcq+32] - packssdw xm3, [uqcq+48] - - ; Compute the errors. - psubw xm0, xm2 - psubw xm1, xm3 - - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). - pmaddwd xm2, xm2 - pmaddwd xm3, xm3 - - pmaddwd xm0, xm0 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; Accumulate horizontally in 64 bits, there is no chance of overflow here - pxor xm5, xm5 - - pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm2, 32 ; Zero extended high of a pair of 32 bits - - pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits - psrlq xm0, 32 ; Zero extended high of a pair of 32 bits - - paddq xm2, xm3 - paddq xm0, xm1 - - psrldq xm3, xm2, 8 - psrldq xm1, xm0, 8 - - paddq xm2, xm3 - paddq xm0, xm1 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm0 - movq [sszq], xm2 -%else - movd eax, xm0 - pextrd edx, xm0, 1 - movq [sszd], xm2 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, speculative low precision - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ALIGN 16 -.generic: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; overflow detection register for xm4 - pxor xm6, xm6 ; ssz accumulator - pxor xm7, xm7 ; overflow detection register for xm6 - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ; Push the negative size as the high precision code might need it - push sizeq - -.loop: - ; Load input vectors - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; Compute the squared errors. - ; Individual errors are max 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit). - psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; Squares are always positive, so we can use unsigned arithmetic after - ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will - ; fit in 32bits - paddd xm2, xm3 - paddd xm0, xm1 - - ; We accumulate using 32 bit arithmetic, but detect potential overflow - ; by checking if the MSB of the accumulators have ever been a set bit. - ; If yes, we redo the whole compute at the end on higher precision, but - ; this happens extremely rarely, so we still achieve a net gain. - paddd xm4, xm0 - paddd xm6, xm2 - por xm5, xm4 ; OR in the accumulator for overflow detection - por xm7, xm6 ; OR in the accumulator for overflow detection - - jnz .loop - - ; Add pairs horizontally (still only on 32 bits) - phaddd xm4, xm4 - por xm5, xm4 ; OR in the accumulator for overflow detection - phaddd xm6, xm6 - por xm7, xm6 ; OR in the accumulator for overflow detection - - ; Check for possibility of overflow by testing if bit 32 of each dword lane - ; have ever been set. If they were not, then there was no overflow and the - ; final sum will fit in 32 bits. If overflow happened, then - ; we redo the whole computation on higher precision. - por xm7, xm5 - pmovmskb r4, xm7 - test r4, 0x8888 - jnz .highprec - - phaddd xm4, xm4 - phaddd xm6, xm6 - pmovzxdq xm4, xm4 - pmovzxdq xm6, xm6 - - ; Restore stack - pop sizeq - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of size != 16, high precision case - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -.highprec: - pxor xm4, xm4 ; sse accumulator - pxor xm5, xm5 ; dedicated zero register - pxor xm6, xm6 ; ssz accumulator - pop sizeq - -.loophp: - mova xm0, [dqcq+sizeq*4] - packssdw xm0, [dqcq+sizeq*4+16] - mova xm2, [uqcq+sizeq*4] - packssdw xm2, [uqcq+sizeq*4+16] - - mova xm1, [dqcq+sizeq*4+32] - packssdw xm1, [dqcq+sizeq*4+48] - mova xm3, [uqcq+sizeq*4+32] - packssdw xm3, [uqcq+sizeq*4+48] - - add sizeq, 16 - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw xm0, xm2 - pmaddwd xm2, xm2 - pmaddwd xm0, xm0 - - psubw xm1, xm3 - pmaddwd xm3, xm3 - pmaddwd xm1, xm1 - - ; accumulate in 64bit - punpckldq xm7, xm0, xm5 - punpckhdq xm0, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm2, xm5 - punpckhdq xm2, xm5 - paddq xm6, xm7 - - punpckldq xm7, xm1, xm5 - punpckhdq xm1, xm5 - paddq xm4, xm7 - - punpckldq xm7, xm3, xm5 - punpckhdq xm3, xm5 - paddq xm6, xm7 - - paddq xm4, xm0 - paddq xm4, xm1 - paddq xm6, xm2 - paddq xm6, xm3 - - jnz .loophp - - ; Accumulate horizontally - movhlps xm5, xm4 - movhlps xm7, xm6 - paddq xm4, xm5 - paddq xm6, xm7 - - ; Store the return value -%if ARCH_X86_64 - movq rax, xm4 - movq [sszq], xm6 -%else - movd eax, xm4 - pextrd edx, xm4, 1 - movq [sszd], xm6 -%endif - RET - -END diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm deleted file mode 100644 index f3b8f01947..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_error_sse2.asm +++ /dev/null @@ -1,98 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text -ALIGN 16 - -; -; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff, -; intptr_t block_size, int64_t *ssz) -; - -INIT_XMM sse2 -cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz - pxor m4, m4 ; sse accumulator - pxor m6, m6 ; ssz accumulator - pxor m5, m5 ; dedicated zero register - lea uqcq, [uqcq+sizeq*4] - lea dqcq, [dqcq+sizeq*4] - neg sizeq - - ALIGN 16 - -.loop: - mova m0, [dqcq+sizeq*4] - packssdw m0, [dqcq+sizeq*4+mmsize] - mova m2, [uqcq+sizeq*4] - packssdw m2, [uqcq+sizeq*4+mmsize] - - mova m1, [dqcq+sizeq*4+mmsize*2] - packssdw m1, [dqcq+sizeq*4+mmsize*3] - mova m3, [uqcq+sizeq*4+mmsize*2] - packssdw m3, [uqcq+sizeq*4+mmsize*3] - - add sizeq, mmsize - - ; individual errors are max. 15bit+sign, so squares are 30bit, and - ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) - - psubw m0, m2 - pmaddwd m2, m2 - pmaddwd m0, m0 - - psubw m1, m3 - pmaddwd m3, m3 - pmaddwd m1, m1 - - ; accumulate in 64bit - punpckldq m7, m0, m5 - punpckhdq m0, m5 - paddq m4, m7 - - punpckldq m7, m2, m5 - punpckhdq m2, m5 - paddq m6, m7 - - punpckldq m7, m1, m5 - punpckhdq m1, m5 - paddq m4, m7 - - punpckldq m7, m3, m5 - punpckhdq m3, m5 - paddq m6, m7 - - paddq m4, m0 - paddq m4, m1 - paddq m6, m2 - paddq m6, m3 - - jnz .loop - - ; accumulate horizontally and store in return value - movhlps m5, m4 - movhlps m7, m6 - paddq m4, m5 - paddq m6, m7 - -%if ARCH_X86_64 - movq rax, m4 - movq [sszq], m6 -%else - mov eax, sszm - pshufd m5, m4, 0x1 - movq [eax], m6 - movd eax, m4 - movd edx, m5 -%endif - RET diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 0000000000..bf44b08674 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +// Zero fill 8 positions in the output buffer. +static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static VPX_FORCE_INLINE void load_fp_values_avx2( + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) { + const __m256i eob_lo = eob256; + // Copy upper 128 to lower 128 + const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81); + __m256i eob = _mm256_max_epi16(eob_lo, eob_hi); + __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize_fp_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr)); + + if (nzflag) { + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_setzero_si256(); + + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +// Enable this flag when matching the optimized code to +// vp9_quantize_fp_32x32_c(). Disabled, the optimized code will match the +// existing ssse3 code and quantize_fp_32x32_nz_c(). +// +// #define MATCH_VP9_QUANTIZE_FP_32X32_C + +#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C +static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + (void)thr; +} +#endif + +static VPX_FORCE_INLINE void quantize_fp_32x32_16( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) { + const __m256i coeff = load_tran_low(coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi16(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + const int32_t nzflag = _mm256_movemask_epi8(thr_mask); + + if (nzflag) { +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask); +#else + const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round); +#endif + const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant); + const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff); + const __m256i abs_dqcoeff = + _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1); + const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff); + const __m256i nz_mask = + _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256()); + store_tran_low(qcoeff, qcoeff_ptr); + store_tran_low(dqcoeff, dqcoeff_ptr); + + *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask); + } else { + store_zero_tran_low(qcoeff_ptr); + store_zero_tran_low(dqcoeff_ptr); + } +} + +void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_srli_epi16(dequant, 2); + quant = _mm256_slli_epi16(quant, 1); + { + const __m256i rnd = _mm256_set1_epi16((int16_t)1); + round = _mm256_add_epi16(round, rnd); + round = _mm256_srai_epi16(round, 1); + } + +#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1)); + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); +#else + quantize_fp_32x32_16_no_nzflag( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); +#endif + + n_coeffs += 8 * 2; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += 8 * 2; + } + + *eob_ptr = get_max_eob(eob_max); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { + const __m128i v = _mm_load_si128((const __m128i *)val_ptr); + const __m128i zero = _mm_setzero_si128(); + const __m128i dc = _mm_unpacklo_epi16(v, zero); + const __m128i ac = _mm_unpackhi_epi16(v, zero); + return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void highbd_load_fp_values( + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = highbd_init_256(mb_plane->quant_fp); + *dequant = highbd_init_256(dequant_ptr); +} + +static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( + const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i round, quant, dequant; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} + +static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( + const __m256i *round, const __m256i *quant, const __m256i *dequant, + const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr); + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + + _mm256_storeu_si256((__m256i *)qcoeff_ptr, q); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq); + + *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask); +} + +void vp9_highbd_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i round, quant, dequant, thr; + __m256i eob_max = _mm256_setzero_si256(); + const int16_t *iscan = scan_order->iscan; + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + // Setup global values + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + thr = _mm256_srli_epi32(dequant, 2); + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1)); + quant = _mm256_slli_epi32(quant, 1); + round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1); + + highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, + iscan + n_coeffs, qcoeff_ptr + n_coeffs, + dqcoeff_ptr + n_coeffs, &eob_max); + + n_coeffs += step; + + // remove dc constants + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31); + quant = _mm256_permute2x128_si256(quant, quant, 0x31); + round = _mm256_permute2x128_si256(round, round, 0x31); + thr = _mm256_permute2x128_si256(thr, thr, 0x31); + + // AC only loop + while (n_coeffs < 0) { + highbd_quantize_fp_32x32( + &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs, + qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max); + n_coeffs += step; + } + + *eob_ptr = get_max_eob(eob_max); +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 3f8ee5f244..2481eb366e 100644 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,203 +8,119 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -void vp9_quantize_fp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - __m128i zero; +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); __m128i thr; - int16_t nzflag; - (void)scan_ptr; - (void)zbin_ptr; - (void)quant_shift_ptr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - __m128i coeff0, coeff1; + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs)); - coeff1 = _mm_load_si128((const __m128i *)(coeff_ptr + n_coeffs) + 1); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c new file mode 100644 index 0000000000..98decae749 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; + + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + __m128i eob0; + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one_s16 = _mm_set1_epi16(1); + __m128i thr; + int nzflag; + int index = 16; + __m128i round, quant, dequant; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i eob; + const int16_t *iscan = scan_order->iscan; + + // Setup global values. + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); + // The 32x32 halves round. + round = _mm_add_epi16(round, one_s16); + round = _mm_srli_epi16(round, 1); + + // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw so + // upshift quant to account for this. + quant = _mm_slli_epi16(quant, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr); + store_tran_low(qcoeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + thr = _mm_srai_epi16(dequant, 2); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant); + qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + // Get the abs value of qcoeff again so we can use shifts for division. + qcoeff0 = _mm_abs_epi16(qcoeff0); + qcoeff1 = _mm_abs_epi16(qcoeff1); + + qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant); + qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + // Divide by 2. + qcoeff0 = _mm_srli_epi16(qcoeff0, 1); + qcoeff1 = _mm_srli_epi16(qcoeff1, 1); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + store_tran_low(qcoeff0, dqcoeff_ptr + index); + store_tran_low(qcoeff1, dqcoeff_ptr + index + 8); + } else { + store_zero_tran_low(qcoeff_ptr + index); + store_zero_tran_low(qcoeff_ptr + index + 8); + + store_zero_tran_low(dqcoeff_ptr + index); + store_zero_tran_low(dqcoeff_ptr + index + 8); + } + + if (nzflag) { + const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm deleted file mode 100644 index ec61c0c3a7..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,201 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -%macro QUANTIZE_FP 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, fp_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m1, m5 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - mov r3, qcoeffmp - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, fp_32x32 - psllw m2, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - lea coeffq, [ coeffq+ncoeffq*2] - lea r5q, [ r5q+ncoeffq*2] - lea r3q, [ r3q+ncoeffq*2] - lea r4q, [r4q+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m8 - mova [r3q+ncoeffq*2+16], m13 -%ifidn %1, fp_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; r4[i] = r3[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 - psrlw m0, m3, 2 -%else - psrlw m0, m3, 1 -%endif - mova [r4q+ncoeffq*2+ 0], m8 - mova [r4q+ncoeffq*2+16], m13 - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - - pcmpgtw m7, m6, m0 - pcmpgtw m12, m11, m0 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - - or r6, r2 - jz .skip_iter - - pcmpeqw m7, m7 - - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m14 - mova [r3q+ncoeffq*2+16], m13 -%ifidn %1, fp_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; r4[i] = r3[i] * q - pmullw m13, m3 ; r4[i] = r3[i] * q -%ifidn %1, fp_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - mova [r4q+ncoeffq*2+ 0], m14 - mova [r4q+ncoeffq*2+16], m13 - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - - jmp .accumulate_eob -.skip_iter: - mova [r3q+ncoeffq*2+ 0], m5 - mova [r3q+ncoeffq*2+16], m5 - mova [r4q+ncoeffq*2+ 0], m5 - mova [r4q+ncoeffq*2+16], m5 - add ncoeffq, mmsize - jl .ac_only_loop - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [r0q+ncoeffq*2+ 0], m7 - mova [r0q+ncoeffq*2+16], m7 - mova [r2q+ncoeffq*2+ 0], m7 - mova [r2q+ncoeffq*2+16], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FP fp, 7 -QUANTIZE_FP fp_32x32, 7 diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm deleted file mode 100644 index 21aaa93831..0000000000 --- a/media/libvpx/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -; void vp9_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(vp9_temporal_filter_apply_sse2) PRIVATE -sym(vp9_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne .temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w - times 8 dw 16 diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc new file mode 100644 index 0000000000..b29e1ec236 --- /dev/null +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/ratectrl_rtc.h" + +#include + +#include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx_mem/vpx_mem.h" + +namespace libvpx { + +std::unique_ptr VP9RateControlRTC::Create( + const VP9RateControlRtcConfig &cfg) { + std::unique_ptr rc_api(new (std::nothrow) + VP9RateControlRTC()); + if (!rc_api) return nullptr; + rc_api->cpi_ = static_cast(vpx_memalign(32, sizeof(*cpi_))); + if (!rc_api->cpi_) return nullptr; + vp9_zero(*rc_api->cpi_); + + if (!rc_api->InitRateControl(cfg)) return nullptr; + if (cfg.aq_mode) { + VP9_COMP *const cpi = rc_api->cpi_; + cpi->segmentation_map = static_cast( + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->segmentation_map))); + if (!cpi->segmentation_map) return nullptr; + cpi->cyclic_refresh = + vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); + cpi->cyclic_refresh->content_mode = 0; + } + return rc_api; +} + +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + +bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { + VP9_COMMON *cm = &cpi_->common; + VP9EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + cm->profile = PROFILE_0; + cm->bit_depth = VPX_BITS_8; + cm->show_frame = 1; + oxcf->profile = cm->profile; + oxcf->bit_depth = cm->bit_depth; + oxcf->rc_mode = rc_cfg.rc_mode; + oxcf->pass = 0; + oxcf->aq_mode = rc_cfg.aq_mode ? CYCLIC_REFRESH_AQ : NO_AQ; + oxcf->content = VP9E_CONTENT_DEFAULT; + oxcf->drop_frames_water_mark = 0; + cm->current_video_frame = 0; + rc->kf_boost = DEFAULT_KF_BOOST; + + if (!UpdateRateControl(rc_cfg)) return false; + vp9_set_mb_mi(cm, cm->width, cm->height); + + cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + ? 1 + : 0; + + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + vp9_rc_init_minq_luts(); + vp9_rc_init(oxcf, 0, rc); + rc->constrain_gf_key_freq_onepass_vbr = 0; + cpi_->sf.use_nonrd_pick_mode = 1; + return true; +} + +bool VP9RateControlRTC::UpdateRateControl( + const VP9RateControlRtcConfig &rc_cfg) { + // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5) + // and VPX_TS_MAX_LAYERS (5), check all three. + if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS || + rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) { + return false; + } + + VP9_COMMON *cm = &cpi_->common; + VP9EncoderConfig *oxcf = &cpi_->oxcf; + RATE_CONTROL *const rc = &cpi_->rc; + + cm->width = rc_cfg.width; + cm->height = rc_cfg.height; + oxcf->width = rc_cfg.width; + oxcf->height = rc_cfg.height; + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(rc_cfg.max_quantizer); + oxcf->best_allowed_q = vp9_quantizer_to_qindex(rc_cfg.min_quantizer); + rc->worst_quality = oxcf->worst_allowed_q; + rc->best_quality = oxcf->best_allowed_q; + oxcf->init_framerate = rc_cfg.framerate; + oxcf->target_bandwidth = 1000 * rc_cfg.target_bandwidth; + oxcf->starting_buffer_level_ms = rc_cfg.buf_initial_sz; + oxcf->optimal_buffer_level_ms = rc_cfg.buf_optimal_sz; + oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; + oxcf->under_shoot_pct = rc_cfg.undershoot_pct; + oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; + oxcf->ss_number_layers = rc_cfg.ss_number_layers; + oxcf->ts_number_layers = rc_cfg.ts_number_layers; + oxcf->temporal_layering_mode = + (VP9E_TEMPORAL_LAYERING_MODE)((rc_cfg.ts_number_layers > 1) + ? rc_cfg.ts_number_layers + : 0); + + cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; + cpi_->oxcf.rc_max_inter_bitrate_pct = rc_cfg.max_inter_bitrate_pct; + cpi_->framerate = rc_cfg.framerate; + cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; + cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + oxcf->layer_target_bitrate[layer] = + 1000 * rc_cfg.layer_target_bitrate[layer]; + lrc->worst_quality = + vp9_quantizer_to_qindex(rc_cfg.max_quantizers[layer]); + lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); + lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; + lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; + } + } + vp9_set_rc_buffer_sizes(cpi_); + vp9_new_framerate(cpi_, cpi_->framerate); + if (cpi_->svc.number_temporal_layers > 1 || + cpi_->svc.number_spatial_layers > 1) { + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } + vp9_update_layer_context_change_config(cpi_, + (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; + } + vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; + return true; +} + +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { + VP9_COMMON *const cm = &cpi_->common; + int width, height; + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1) { + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + get_layer_resolution(cpi_->oxcf.width, cpi_->oxcf.height, + lc->scaling_factor_num, lc->scaling_factor_den, &width, + &height); + cm->width = width; + cm->height = height; + } + vp9_set_mb_mi(cm, cm->width, cm->height); + cm->frame_type = static_cast(frame_params.frame_type); + // This is needed to ensure key frame does not get unset in rc_get_svc_params. + cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; + cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; + cpi_->sf.use_nonrd_pick_mode = 1; + if (cpi_->svc.number_spatial_layers == 1 && + cpi_->svc.number_temporal_layers == 1) { + int target = 0; + if (cpi_->oxcf.rc_mode == VPX_CBR) { + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_cbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_cbr(cpi_); + } else if (cpi_->oxcf.rc_mode == VPX_VBR) { + if (cm->frame_type == KEY_FRAME) { + cpi_->rc.this_key_frame_forced = cm->current_video_frame != 0; + cpi_->rc.frames_to_key = cpi_->oxcf.key_freq; + } + vp9_set_gf_update_one_pass_vbr(cpi_); + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_update_parameters(cpi_); + if (frame_is_intra_only(cm)) + target = vp9_calc_iframe_target_size_one_pass_vbr(cpi_); + else + target = vp9_calc_pframe_target_size_one_pass_vbr(cpi_); + } + vp9_rc_set_frame_target(cpi_, target); + vp9_update_buffer_level_preencode(cpi_); + } else { + vp9_update_temporal_layer_framerate(cpi_); + vp9_restore_layer_context(cpi_); + vp9_rc_get_svc_params(cpi_); + } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. + // Never drop on key frame, or if base layer is key for svc, + if (!frame_is_intra_only(cm) && + (!cpi_->use_svc || + !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi_)) { + // For FULL_SUPERFRAME_DROP mode (the only mode considered here): + // if the superframe drop is decided we need to save the layer context for + // all spatial layers, and call update_buffer_level and postencode_drop + // for all spatial layers. + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_save_layer_context(cpi_); + for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) { + cpi_->svc.spatial_layer_id = sl; + vp9_restore_layer_context(cpi_); + vp9_update_buffer_level_svc_preencode(cpi_); + vp9_rc_postencode_update_drop_frame(cpi_); + vp9_save_layer_context(cpi_); + } + } + return FrameDropDecision::kDrop; + } + } + // Compute the QP for the frame. + int bottom_index, top_index; + cpi_->common.base_qindex = + vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); + + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + + cpi_->last_frame_dropped = 0; + cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0; + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->svc.num_encoded_top_layer++; + + return FrameDropDecision::kOk; +} + +int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } + +int VP9RateControlRTC::GetLoopfilterLevel() const { + struct loopfilter *const lf = &cpi_->common.lf; + vp9_pick_filter_level(nullptr, cpi_, LPF_PICK_FROM_Q); + return lf->filter_level; +} + +bool VP9RateControlRTC::GetSegmentationData( + VP9SegmentationData *segmentation_data) const { + if (!cpi_->cyclic_refresh || !cpi_->cyclic_refresh->apply_cyclic_refresh) { + return false; + } + + segmentation_data->segmentation_map = cpi_->segmentation_map; + segmentation_data->segmentation_map_size = + cpi_->common.mi_cols * cpi_->common.mi_rows; + segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta; + segmentation_data->delta_q_size = 3u; + return true; +} + +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } + vp9_rc_postencode_update(cpi_, encoded_frame_size); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + cpi_->common.current_video_frame++; +} + +} // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h new file mode 100644 index 0000000000..4c39255886 --- /dev/null +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_RATECTRL_RTC_H_ +#define VPX_VP9_RATECTRL_RTC_H_ + +#include +#include +#include +#include + +#include "vpx/vpx_encoder.h" +#include "vpx/internal/vpx_ratectrl_rtc.h" + +struct VP9_COMP; + +namespace libvpx { +struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { + VP9RateControlRtcConfig() { + memset(layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(ts_rate_decimator, 0, sizeof(ts_rate_decimator)); + scaling_factor_num[0] = 1; + scaling_factor_den[0] = 1; + max_quantizers[0] = max_quantizer; + min_quantizers[0] = min_quantizer; + } + + // Number of spatial layers + int ss_number_layers = 1; + int max_quantizers[VPX_MAX_LAYERS] = {}; + int min_quantizers[VPX_MAX_LAYERS] = {}; + int scaling_factor_num[VPX_SS_MAX_LAYERS] = {}; + int scaling_factor_den[VPX_SS_MAX_LAYERS] = {}; + // This is only for SVC for now. + int max_consec_drop = std::numeric_limits::max(); +}; + +struct VP9FrameParamsQpRTC { + RcFrameType frame_type; + int spatial_layer_id; + int temporal_layer_id; +}; + +struct VP9SegmentationData { + const uint8_t *segmentation_map; + size_t segmentation_map_size; + const int *delta_q; + size_t delta_q_size; +}; + +// This interface allows using VP9 real-time rate control without initializing +// the encoder. To use this interface, you need to link with libvpxrc.a. +// +// #include "vp9/ratectrl_rtc.h" +// VP9RateControlRtcConfig cfg; +// VP9FrameParamsQpRTC frame_params; +// +// YourFunctionToInitializeConfig(cfg); +// std::unique_ptr rc_api = VP9RateControlRTC::Create(cfg); +// // start encoding +// while (frame_to_encode) { +// if (config_changed) +// rc_api->UpdateRateControl(cfg); +// YourFunctionToFillFrameParams(frame_params); +// rc_api->ComputeQP(frame_params); +// YourFunctionToUseQP(rc_api->GetQP()); +// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel()); +// // After encoding +// rc_api->PostEncode(encoded_frame_size, frame_params); +// } +class VP9RateControlRTC { + public: + static std::unique_ptr Create( + const VP9RateControlRtcConfig &cfg); + ~VP9RateControlRTC(); + + bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); + // GetQP() needs to be called after ComputeQP() to get the latest QP + int GetQP() const; + int GetLoopfilterLevel() const; + bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called (vp9_rc_postencode_update_drop_frame is already + // called via ComputeQP if drop is decided). + FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); + // Feedback to rate control with the size of current encoded frame + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); + + private: + VP9RateControlRTC() = default; + bool InitRateControl(const VP9RateControlRtcConfig &cfg); + struct VP9_COMP *cpi_ = nullptr; +}; + +} // namespace libvpx + +#endif // VPX_VP9_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_common.mk b/media/libvpx/libvpx/vp9/vp9_common.mk index 5bfc0d3599..5ef2f891a8 100644 --- a/media/libvpx/libvpx/vp9/vp9_common.mk +++ b/media/libvpx/libvpx/vp9/vp9_common.mk @@ -10,6 +10,7 @@ VP9_COMMON_SRCS-yes += vp9_common.mk VP9_COMMON_SRCS-yes += vp9_iface_common.h +VP9_COMMON_SRCS-yes += vp9_iface_common.c VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c VP9_COMMON_SRCS-yes += common/vp9_blockd.c @@ -63,30 +64,36 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h + ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm endif ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c -endif - -# common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c - -ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c -endif - -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c index 77b13da58f..3e896848ff 100644 --- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -8,21 +8,35 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include #include #include #include "./vpx_config.h" #include "vpx/vpx_encoder.h" -#include "vpx_ports/vpx_once.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx_dsp/psnr.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" #include "vpx/vp8cx.h" +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_scale.h" +#include "vp9/vp9_cx_iface.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" -struct vp9_extracfg { +#include "vpx/vpx_tpl.h" + +typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; unsigned int noise_sensitivity; @@ -30,6 +44,8 @@ struct vp9_extracfg { unsigned int static_thresh; unsigned int tile_columns; unsigned int tile_rows; + unsigned int enable_tpl_model; + unsigned int enable_keyframe_filtering; unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; @@ -51,16 +67,25 @@ struct vp9_extracfg { vpx_color_range_t color_range; int render_width; int render_height; -}; + unsigned int row_mt; + unsigned int motion_vector_unit_test; + int delta_q_uv; +} vp9_extracfg; static struct vp9_extracfg default_extra_cfg = { - 0, // cpu_used +#if CONFIG_REALTIME_ONLY + 5, // cpu_used +#else + 0, // cpu_used +#endif 1, // enable_auto_alt_ref 0, // noise_sensitivity 0, // sharpness 0, // static_thresh 6, // tile_columns 0, // tile_rows + 1, // enable_tpl_model + 0, // enable_keyframe_filtering 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision @@ -82,12 +107,17 @@ static struct vp9_extracfg default_extra_cfg = { 0, // color range 0, // render width 0, // render height + 0, // row_mt + 0, // motion_vector_unit_test + 0, // delta_q_uv }; struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP9EncoderConfig oxcf; VP9_COMP *cpi; unsigned char *cx_data; @@ -105,8 +135,14 @@ struct vpx_codec_alg_priv { vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb; // BufferPool that holds all reference frames. BufferPool *buffer_pool; + vpx_fixed_buf_t global_headers; + int global_header_subsampling; }; +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init() because the `error` paramerer (cpi->common.error) will be +// destroyed by vpx_codec_enc_init_ver() after encoder_init() returns an error. +// See the "IMPORTANT" comment in vpx_codec_enc_init_ver(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; @@ -124,10 +160,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -148,8 +184,8 @@ static vpx_codec_err_t update_error_state( static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { - RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available - RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000); RANGE_CHECK_HI(cfg, g_profile, 3); @@ -161,18 +197,23 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); - RANGE_CHECK_HI(cfg, g_threads, 64); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); if (extra_cfg->max_gf_interval > 0) { @@ -183,6 +224,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, (MAX_LAG_BUFFERS - 1)); } + // For formation of valid ARF groups lag_in _frames should be 0 or greater + // than the max_gf_interval + 2 + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + if (cfg->rc_resize_allowed == 1) { RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); @@ -198,7 +246,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && - level != LEVEL_MAX) + level != LEVEL_AUTO && level != LEVEL_MAX) ERROR("target_level is invalid"); } @@ -221,22 +269,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } -#if CONFIG_SPATIAL_SVC - - if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && - cfg->g_pass == VPX_RC_LAST_PASS) { - unsigned int i, alt_ref_sum = 0; - for (i = 0; i < cfg->ss_number_layers; ++i) { - if (cfg->ss_enable_auto_alt_ref[i]) ++alt_ref_sum; - } - if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) - ERROR("Not enough ref buffers for svc alt ref frames"); - if (cfg->ss_number_layers * cfg->ts_number_layers > 3 && - cfg->g_error_resilient == 0) - ERROR("Multiple frame context are not supported for more than 3 layers"); - } -#endif - // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && @@ -245,8 +277,10 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "kf_min_dist not supported in auto mode, use 0 " "or kf_max_dist instead."); - RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, row_mt, 0, 1); + RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); + RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -259,10 +293,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); - // TODO(yaowu): remove this when ssim tuning is implemented for vp9 - if (extra_cfg->tuning == VP8_TUNE_SSIM) - ERROR("Option --tune=ssim is not currently supported in VP9."); - +#if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); @@ -314,6 +345,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } +#endif // !CONFIG_REALTIME_ONLY #if !CONFIG_VP9_HIGHBITDEPTH if (cfg->g_profile > (unsigned int)PROFILE_1) { @@ -333,6 +365,24 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB); RANGE_CHECK(extra_cfg, color_range, VPX_CR_STUDIO_RANGE, VPX_CR_FULL_RANGE); + + // The range below shall be further tuned. + RANGE_CHECK(cfg, use_vizier_rc_params, 0, 1); + RANGE_CHECK(cfg, active_wq_factor.den, 1, 1000); + RANGE_CHECK(cfg, err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, sr_default_decay_limit.den, 1, 1000); + RANGE_CHECK(cfg, sr_diff_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_err_per_mb_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_min_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_frame_max_boost_subs_factor.den, 1, 1000); + RANGE_CHECK(cfg, kf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_max_total_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, gf_frame_max_boost_factor.den, 1, 1000); + RANGE_CHECK(cfg, zm_factor.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_inter_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_arf_qp_fac.den, 1, 1000); + RANGE_CHECK(cfg, rd_mult_key_qp_fac.den, 1, 1000); + return VPX_CODEC_OK; } @@ -341,14 +391,15 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, switch (img->fmt) { case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_I420: - case VPX_IMG_FMT_I42016: break; + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_NV12: break; case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) { ERROR( - "Invalid image format. I422, I444, I440 images are " - "not supported in profile."); + "Invalid image format. I422, I444, I440 images are not supported " + "in profile."); } break; case VPX_IMG_FMT_I42216: @@ -363,8 +414,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, break; default: ERROR( - "Invalid image format. Only YV12, I420, I422, I444 images are " - "supported."); + "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 " + "images are supported."); break; } @@ -377,6 +428,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static int get_image_bps(const vpx_image_t *img) { switch (img->fmt) { case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I420: return 12; case VPX_IMG_FMT_I422: return 16; case VPX_IMG_FMT_I444: return 24; @@ -419,10 +471,20 @@ static void config_target_level(VP9EncoderConfig *oxcf) { oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); // Adjust minimum art-ref distance. - if (oxcf->min_gf_interval < - (int)vp9_level_defs[target_level_index].min_altref_distance) + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { oxcf->min_gf_interval = - (int)vp9_level_defs[target_level_index].min_altref_distance; + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } // Adjust maximum column tiles. if (vp9_level_defs[target_level_index].max_col_tiles < @@ -434,20 +496,33 @@ static void config_target_level(VP9EncoderConfig *oxcf) { } } +static vpx_rational64_t get_g_timebase_in_ts(vpx_rational_t g_timebase) { + vpx_rational64_t g_timebase_in_ts; + g_timebase_in_ts.den = g_timebase.den; + g_timebase_in_ts.num = g_timebase.num; + g_timebase_in_ts.num *= TICKS_PER_SEC; + reduce_ratio(&g_timebase_in_ts); + return g_timebase_in_ts; +} + static vpx_codec_err_t set_encoder_config( - VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, + VP9EncoderConfig *oxcf, vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { - const int is_vbr = cfg->rc_end_usage == VPX_VBR; int sl, tl; + unsigned int raw_target_rate; oxcf->profile = cfg->g_profile; oxcf->max_threads = (int)cfg->g_threads; oxcf->width = cfg->g_w; oxcf->height = cfg->g_h; oxcf->bit_depth = cfg->g_bit_depth; oxcf->input_bit_depth = cfg->g_input_bit_depth; + // TODO(angiebird): Figure out if we can just use g_timebase to indicate the + // inverse of framerate // guess a frame rate if out of whack, use 30 oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; if (oxcf->init_framerate > 180) oxcf->init_framerate = 30; + oxcf->g_timebase = cfg->g_timebase; + oxcf->g_timebase_in_ts = get_g_timebase_in_ts(oxcf->g_timebase); oxcf->mode = GOOD; @@ -461,8 +536,15 @@ static vpx_codec_err_t set_encoder_config( cfg->g_pass == VPX_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames; oxcf->rc_mode = cfg->rc_end_usage; + raw_target_rate = + (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 * + oxcf->init_framerate / 1000); + // Cap target bitrate to raw rate or 1000Mbps, whichever is less + cfg->rc_target_bitrate = + VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000); + // Convert target bandwidth from Kbit/s to Bit/s - oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; + oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; @@ -488,15 +570,16 @@ static vpx_codec_err_t set_encoder_config( oxcf->resize_mode = RESIZE_NONE; } - oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; - oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; - oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; + oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz; oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; @@ -506,14 +589,16 @@ static vpx_codec_err_t set_encoder_config( oxcf->speed = abs(extra_cfg->cpu_used); oxcf->encode_breakout = extra_cfg->static_thresh; oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; - oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + if (oxcf->bit_depth == VPX_BITS_8) { + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + } else { + // Disable denoiser for high bitdepth since vp9_denoiser_filter only works + // for 8 bits. + oxcf->noise_sensitivity = 0; + } oxcf->sharpness = extra_cfg->sharpness; - oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; - -#if CONFIG_FP_MB_STATS - oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; -#endif + vp9_set_first_pass_stats(oxcf, &cfg->rc_twopass_stats_in); oxcf->color_space = extra_cfg->color_space; oxcf->color_range = extra_cfg->color_range; @@ -529,6 +614,10 @@ static vpx_codec_err_t set_encoder_config( oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + + oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering; + // TODO(yunqing): The dependencies between row tiles cause error in multi- // threaded encoding. For now, tile_rows is forced to be 0 in this case. // The further fix can be done by adding synchronizations after a tile row @@ -554,20 +643,23 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_level = extra_cfg->target_level; + oxcf->row_mt = extra_cfg->row_mt; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + + oxcf->delta_q_uv = extra_cfg->delta_q_uv; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; -#endif for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { - oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = - 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; } } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref; -#endif } if (oxcf->ts_number_layers > 1) { for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { @@ -579,54 +671,161 @@ static vpx_codec_err_t set_encoder_config( } if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); - /* - printf("Current VP9 Settings: \n"); - printf("target_bandwidth: %d\n", oxcf->target_bandwidth); - printf("target_level: %d\n", oxcf->target_level); - printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity); - printf("sharpness: %d\n", oxcf->sharpness); - printf("cpu_used: %d\n", oxcf->cpu_used); - printf("Mode: %d\n", oxcf->mode); - printf("auto_key: %d\n", oxcf->auto_key); - printf("key_freq: %d\n", oxcf->key_freq); - printf("end_usage: %d\n", oxcf->end_usage); - printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct); - printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct); - printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level); - printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level); - printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size); - printf("fixed_q: %d\n", oxcf->fixed_q); - printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q); - printf("best_allowed_q: %d\n", oxcf->best_allowed_q); - printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling); - printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width); - printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height); - printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); - printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); - printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); - printf("lag_in_frames: %d\n", oxcf->lag_in_frames); - printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf); - printf("Version: %d\n", oxcf->Version); - printf("encode_breakout: %d\n", oxcf->encode_breakout); - printf("error resilient: %d\n", oxcf->error_resilient_mode); - printf("frame parallel detokenization: %d\n", - oxcf->frame_parallel_decoding_mode); - */ + // vp9_dump_encoder_config(oxcf, stderr); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t set_twopass_params_from_config( + const vpx_codec_enc_cfg_t *const cfg, struct VP9_COMP *cpi) { + if (!cfg->use_vizier_rc_params) return VPX_CODEC_OK; + if (cpi == NULL) return VPX_CODEC_ERROR; + + cpi->twopass.use_vizier_rc_params = cfg->use_vizier_rc_params; + + // The values set here are factors that will be applied to default values + // to get the final value used in the two pass code. Hence 1.0 will + // match the default behaviour when not using passed in values. + // We also apply limits here to prevent the user from applying settings + // that make no sense. + cpi->twopass.active_wq_factor = + (double)cfg->active_wq_factor.num / (double)cfg->active_wq_factor.den; + if (cpi->twopass.active_wq_factor < 0.25) + cpi->twopass.active_wq_factor = 0.25; + else if (cpi->twopass.active_wq_factor > 16.0) + cpi->twopass.active_wq_factor = 16.0; + + cpi->twopass.err_per_mb = + (double)cfg->err_per_mb_factor.num / (double)cfg->err_per_mb_factor.den; + if (cpi->twopass.err_per_mb < 0.25) + cpi->twopass.err_per_mb = 0.25; + else if (cpi->twopass.err_per_mb > 4.0) + cpi->twopass.err_per_mb = 4.0; + + cpi->twopass.sr_default_decay_limit = + (double)cfg->sr_default_decay_limit.num / + (double)cfg->sr_default_decay_limit.den; + if (cpi->twopass.sr_default_decay_limit < 0.25) + cpi->twopass.sr_default_decay_limit = 0.25; + // If the default changes this will need to change. + else if (cpi->twopass.sr_default_decay_limit > 1.33) + cpi->twopass.sr_default_decay_limit = 1.33; + + cpi->twopass.sr_diff_factor = + (double)cfg->sr_diff_factor.num / (double)cfg->sr_diff_factor.den; + if (cpi->twopass.sr_diff_factor < 0.25) + cpi->twopass.sr_diff_factor = 0.25; + else if (cpi->twopass.sr_diff_factor > 4.0) + cpi->twopass.sr_diff_factor = 4.0; + + cpi->twopass.kf_err_per_mb = (double)cfg->kf_err_per_mb_factor.num / + (double)cfg->kf_err_per_mb_factor.den; + if (cpi->twopass.kf_err_per_mb < 0.25) + cpi->twopass.kf_err_per_mb = 0.25; + else if (cpi->twopass.kf_err_per_mb > 4.0) + cpi->twopass.kf_err_per_mb = 4.0; + + cpi->twopass.kf_frame_min_boost = (double)cfg->kf_frame_min_boost_factor.num / + (double)cfg->kf_frame_min_boost_factor.den; + if (cpi->twopass.kf_frame_min_boost < 0.25) + cpi->twopass.kf_frame_min_boost = 0.25; + else if (cpi->twopass.kf_frame_min_boost > 4.0) + cpi->twopass.kf_frame_min_boost = 4.0; + + cpi->twopass.kf_frame_max_boost_first = + (double)cfg->kf_frame_max_boost_first_factor.num / + (double)cfg->kf_frame_max_boost_first_factor.den; + if (cpi->twopass.kf_frame_max_boost_first < 0.25) + cpi->twopass.kf_frame_max_boost_first = 0.25; + else if (cpi->twopass.kf_frame_max_boost_first > 4.0) + cpi->twopass.kf_frame_max_boost_first = 4.0; + + cpi->twopass.kf_frame_max_boost_subs = + (double)cfg->kf_frame_max_boost_subs_factor.num / + (double)cfg->kf_frame_max_boost_subs_factor.den; + if (cpi->twopass.kf_frame_max_boost_subs < 0.25) + cpi->twopass.kf_frame_max_boost_subs = 0.25; + else if (cpi->twopass.kf_frame_max_boost_subs > 4.0) + cpi->twopass.kf_frame_max_boost_subs = 4.0; + + cpi->twopass.kf_max_total_boost = (double)cfg->kf_max_total_boost_factor.num / + (double)cfg->kf_max_total_boost_factor.den; + if (cpi->twopass.kf_max_total_boost < 0.25) + cpi->twopass.kf_max_total_boost = 0.25; + else if (cpi->twopass.kf_max_total_boost > 4.0) + cpi->twopass.kf_max_total_boost = 4.0; + + cpi->twopass.gf_max_total_boost = (double)cfg->gf_max_total_boost_factor.num / + (double)cfg->gf_max_total_boost_factor.den; + if (cpi->twopass.gf_max_total_boost < 0.25) + cpi->twopass.gf_max_total_boost = 0.25; + else if (cpi->twopass.gf_max_total_boost > 4.0) + cpi->twopass.gf_max_total_boost = 4.0; + + cpi->twopass.gf_frame_max_boost = (double)cfg->gf_frame_max_boost_factor.num / + (double)cfg->gf_frame_max_boost_factor.den; + if (cpi->twopass.gf_frame_max_boost < 0.25) + cpi->twopass.gf_frame_max_boost = 0.25; + else if (cpi->twopass.gf_frame_max_boost > 4.0) + cpi->twopass.gf_frame_max_boost = 4.0; + + cpi->twopass.zm_factor = + (double)cfg->zm_factor.num / (double)cfg->zm_factor.den; + if (cpi->twopass.zm_factor < 0.25) + cpi->twopass.zm_factor = 0.25; + else if (cpi->twopass.zm_factor > 2.0) + cpi->twopass.zm_factor = 2.0; + + cpi->rd_ctrl.rd_mult_inter_qp_fac = (double)cfg->rd_mult_inter_qp_fac.num / + (double)cfg->rd_mult_inter_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_inter_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_inter_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_inter_qp_fac = 4.0; + + cpi->rd_ctrl.rd_mult_arf_qp_fac = + (double)cfg->rd_mult_arf_qp_fac.num / (double)cfg->rd_mult_arf_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_arf_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_arf_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_arf_qp_fac = 4.0; + + cpi->rd_ctrl.rd_mult_key_qp_fac = + (double)cfg->rd_mult_key_qp_fac.num / (double)cfg->rd_mult_key_qp_fac.den; + if (cpi->rd_ctrl.rd_mult_key_qp_fac < 0.25) + cpi->rd_ctrl.rd_mult_key_qp_fac = 0.25; + else if (cpi->rd_ctrl.rd_mult_key_qp_fac > 4.0) + cpi->rd_ctrl.rd_mult_key_qp_fac = 4.0; + return VPX_CODEC_OK; } static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - int force_key = 0; + volatile int force_key = 0; if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); - if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used (for the case of single spatial layer, since otherwise a previous + // encoded frame at a lower layer may be the desired reference). To make + // sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. + if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height && + (!valid_ref_frame_size(ctx->cpi->last_coded_width, + ctx->cpi->last_coded_height, cfg->g_w, + cfg->g_h) && + ctx->cpi->svc.number_spatial_layers == 1)) || (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || - (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + (ctx->cpi->initial_height && + (int)cfg->g_h > ctx->cpi->initial_height)) { force_key = 1; + } } // Prevent increasing lag_in_frames. This check is stricter than it needs @@ -637,18 +836,29 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, ERROR("Cannot increase lag_in_frames"); res = validate_config(ctx, cfg, &ctx->extra_cfg); + if (res != VPX_CODEC_OK) return res; - if (res == VPX_CODEC_OK) { - ctx->cfg = *cfg; - set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); - // On profile change, request a key frame - force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (setjmp(ctx->cpi->common.error.jmp)) { + const vpx_codec_err_t codec_err = + update_error_state(ctx, &ctx->cpi->common.error); + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + assert(codec_err != VPX_CODEC_OK); + return codec_err; } + ctx->cpi->common.error.setjmp = 1; + + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); + // On profile change, request a key frame + force_key |= ctx->cpi->common.profile != ctx->oxcf.profile; + vp9_change_config(ctx->cpi, &ctx->oxcf); if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF; - return res; + ctx->cpi->common.error.setjmp = 0; + return VPX_CODEC_OK; } static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, @@ -667,12 +877,32 @@ static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_quantizer_svc_layers(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + int i; + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) { + arg[i] = ctx->cpi->svc.base_qindex[i]; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_get_loopfilter_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->cpi->common.lf.filter_level; + return VPX_CODEC_OK; +} + static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, const struct vp9_extracfg *extra_cfg) { const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); if (res == VPX_CODEC_OK) { ctx->extra_cfg = *extra_cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + set_twopass_params_from_config(&ctx->cfg, ctx->cpi); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; @@ -681,7 +911,13 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; + // Use fastest speed setting (speed 9 or -9) if it's set beyond the range. extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + extra_cfg.cpu_used = clamp(extra_cfg.cpu_used, -9, 9); +#if CONFIG_REALTIME_ONLY + if (extra_cfg.cpu_used > -5 && extra_cfg.cpu_used < 5) + extra_cfg.cpu_used = (extra_cfg.cpu_used > 0) ? 5 : -5; +#endif return update_extra_cfg(ctx, &extra_cfg); } @@ -727,6 +963,21 @@ static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_keyframe_filtering(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_keyframe_filtering = + CAST(VP9E_SET_KEY_FRAME_FILTERING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -774,7 +1025,7 @@ static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.rc_max_inter_bitrate_pct = - CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args); + CAST(VP9E_SET_MAX_INTER_BITRATE_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } @@ -804,6 +1055,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0; return update_extra_cfg(ctx, &extra_cfg); } @@ -842,6 +1094,34 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = CAST(VP9E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + if (data) { + cpi->compute_frame_low_motion_onepass = 0; + cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; + cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_motion_vector_unit_test( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.motion_vector_unit_test = + CAST(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return VPX_CODEC_INVALID_PARAM; @@ -864,12 +1144,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) { - return VPX_CODEC_MEM_ERROR; - } -#endif - if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. priv->cfg = *ctx->config.enc; @@ -877,21 +1151,21 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, } priv->extra_cfg = default_extra_cfg; - once(vp9_initialize_enc); + vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); if (res == VPX_CODEC_OK) { + priv->pts_offset_initialized = 0; + priv->global_header_subsampling = -1; set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); #if CONFIG_VP9_HIGHBITDEPTH priv->oxcf.use_highbitdepth = (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0; #endif priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool); - if (priv->cpi == NULL) - res = VPX_CODEC_MEM_ERROR; - else - priv->cpi->output_pkt_list = &priv->pkt_list.head; + if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; + set_twopass_params_from_config(&priv->cfg, priv->cpi); } } @@ -900,29 +1174,36 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); + free(ctx->global_headers.buf); vp9_remove_compressor(ctx->cpi); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - unsigned long deadline) { +static vpx_codec_err_t pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + vpx_enc_deadline_t deadline) { MODE new_mode = BEST; +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { - const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg; - // Convert duration parameter from stream timebase to microseconds. - const uint64_t duration_us = (uint64_t)duration * 1000000 * - (uint64_t)cfg->g_timebase.num / - (uint64_t)cfg->g_timebase.den; + VPX_STATIC_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + if (duration > UINT64_MAX / (uint64_t)ctx->oxcf.g_timebase_in_ts.num) { + ERROR("duration is too big"); + } + uint64_t duration_us = duration * + (uint64_t)ctx->oxcf.g_timebase_in_ts.num / + ((uint64_t)ctx->oxcf.g_timebase_in_ts.den * + (TICKS_PER_SEC / 1000000)); // If the deadline is more that the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode. @@ -934,6 +1215,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_FIRST_PASS: break; case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? GOOD : BEST; break; } +#endif // CONFIG_REALTIME_ONLY if (deadline == VPX_DL_REALTIME) { ctx->oxcf.pass = 0; @@ -944,6 +1226,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, ctx->oxcf.mode = new_mode; vp9_change_config(ctx->cpi, &ctx->oxcf); } + return VPX_CODEC_OK; } // Turn on to test if supplemental superframe data breaks decoding @@ -1005,71 +1288,68 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { return index_sz; } -static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase, - int64_t n) { - return n * TICKS_PER_SEC * timebase->num / timebase->den; -} - -static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, - int64_t n) { - const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1; - return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; -} - static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, unsigned int lib_flags) { vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; + if (!cpi->common.show_frame) { + flags |= VPX_FRAME_IS_INVISIBLE; + } + if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; return flags; } +static INLINE vpx_codec_cx_pkt_t get_psnr_pkt(const PSNR_STATS *psnr) { + vpx_codec_cx_pkt_t pkt; + pkt.kind = VPX_CODEC_PSNR_PKT; + pkt.data.psnr = *psnr; + return pkt; +} + +#if !CONFIG_REALTIME_ONLY +static INLINE vpx_codec_cx_pkt_t +get_first_pass_stats_pkt(FIRSTPASS_STATS *stats) { + // WARNNING: This function assumes that stats will + // exist and not be changed until the packet is processed + // TODO(angiebird): Refactor the code to avoid using the assumption. + vpx_codec_cx_pkt_t pkt; + pkt.kind = VPX_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(*stats); + return pkt; +} +#endif + const size_t kMinCompressedSize = 8192; static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, - vpx_codec_pts_t pts, + vpx_codec_pts_t pts_val, unsigned long duration, vpx_enc_frame_flags_t enc_flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts = pts_val; VP9_COMP *const cpi = ctx->cpi; - const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; + const vpx_rational64_t *const timebase_in_ts = &ctx->oxcf.g_timebase_in_ts; size_t data_sz; + vpx_codec_cx_pkt_t pkt; + memset(&pkt, 0, sizeof(pkt)); if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; - if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && - !cpi->level_constraint.rc_config_updated) { - SVC *const svc = &cpi->svc; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); - const VP9EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; - FIRSTPASS_STATS *stats = &twopass->total_stats; - if (is_two_pass_svc) { - const double frame_rate = 10000000.0 * stats->count / stats->duration; - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } - cpi->level_constraint.rc_config_updated = 1; - } + cpi->last_coded_width = ctx->oxcf.width; + cpi->last_coded_height = ctx->oxcf.height; if (img != NULL) { res = validate_img(ctx, img); @@ -1077,7 +1357,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * - (cpi->multi_arf_allowed ? 8 : 2); + (cpi->multi_layer_arf ? 8 : 2); if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; @@ -1087,10 +1367,27 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_MEM_ERROR; } } + + int chroma_subsampling = -1; + if ((img->fmt & VPX_IMG_FMT_I420) == VPX_IMG_FMT_I420 || + (img->fmt & VPX_IMG_FMT_NV12) == VPX_IMG_FMT_NV12 || + (img->fmt & VPX_IMG_FMT_YV12) == VPX_IMG_FMT_YV12) { + chroma_subsampling = 1; // matches default for Codec Parameter String + } else if ((img->fmt & VPX_IMG_FMT_I422) == VPX_IMG_FMT_I422) { + chroma_subsampling = 2; + } else if ((img->fmt & VPX_IMG_FMT_I444) == VPX_IMG_FMT_I444) { + chroma_subsampling = 3; + } + if (chroma_subsampling > ctx->global_header_subsampling) { + ctx->global_header_subsampling = chroma_subsampling; + } } } - pick_quickcompress_mode(ctx, duration, deadline); + res = pick_quickcompress_mode(ctx, duration, deadline); + if (res != VPX_CODEC_OK) { + return res; + } vpx_codec_pkt_list_init(&ctx->pkt_list); // Handle Flags @@ -1108,7 +1405,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } cpi->common.error.setjmp = 1; - vp9_apply_encoding_flags(cpi, flags); + if (res == VPX_CODEC_OK) vp9_apply_encoding_flags(cpi, flags); // Handle fixed keyframe intervals if (ctx->cfg.kf_mode == VPX_KF_AUTO && @@ -1121,33 +1418,71 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; - YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); - int64_t dst_end_time_stamp = - timebase_units_to_ticks(timebase, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; + // Per-frame PSNR is not supported when g_lag_in_frames is greater than 0. + if ((flags & VPX_EFLAG_CALCULATE_PSNR) && ctx->cfg.g_lag_in_frames != 0) { + vpx_internal_error( + &ctx->cpi->common.error, VPX_CODEC_INCAPABLE, + "Cannot calculate per-frame PSNR when g_lag_in_frames is nonzero"); + } // Set up internal flags - if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; +#if CONFIG_INTERNAL_STATS + assert(cpi->b_calculate_psnr == 1); +#else + cpi->b_calculate_psnr = (ctx->base.init_flags & VPX_CODEC_USE_PSNR) || + (flags & VPX_EFLAG_CALCULATE_PSNR); +#endif if (img != NULL) { + YV12_BUFFER_CONFIG sd; + + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + if (pts < ctx->pts_offset) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } + pts -= ctx->pts_offset; + if (pts > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } + const int64_t dst_time_stamp = + timebase_units_to_ticks(timebase_in_ts, pts); + + cpi->svc.timebase_fac = timebase_units_to_ticks(timebase_in_ts, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (pts > INT64_MAX - (int64_t)duration) { + vpx_internal_error(&cpi->common.error, VPX_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + vpx_codec_pts_t pts_end = pts + (int64_t)duration; + if (pts_end > INT64_MAX / timebase_in_ts->num) { + vpx_internal_error( + &cpi->common.error, VPX_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } + const int64_t dst_end_time_stamp = + timebase_units_to_ticks(timebase_in_ts, pts_end); res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - // Store the original flags in to the frame buffer. Will extract the - // key frame flag when we actually encode this frame. - if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, - dst_time_stamp, dst_end_time_stamp)) { - res = update_error_state(ctx, &cpi->common.error); - } + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. + if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { + res = update_error_state(ctx, &cpi->common.error); } ctx->next_frame_flags = 0; } @@ -1157,127 +1492,154 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, /* Any pending invisible frames? */ if (ctx->pending_cx_data) { + assert(cx_data_sz >= ctx->pending_cx_data_sz); memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz); ctx->pending_cx_data = cx_data; cx_data += ctx->pending_cx_data_sz; cx_data_sz -= ctx->pending_cx_data_sz; - /* TODO: this is a minimal check, the underlying codec doesn't respect - * the buffer size anyway. + /* TODO(webm:1844): this is a minimal check, the underlying codec doesn't + * respect the buffer size anyway. */ if (cx_data_sz < ctx->cx_data_sz / 2) { vpx_internal_error(&cpi->common.error, VPX_CODEC_ERROR, "Compressed data buffer too small"); - return VPX_CODEC_ERROR; } } - while (cx_data_sz >= ctx->cx_data_sz / 2 && - -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, - &dst_time_stamp, &dst_end_time_stamp, - !img)) { - if (size) { - vpx_codec_cx_pkt_t pkt; + if (cpi->oxcf.pass == 1 && !cpi->use_svc) { +#if !CONFIG_REALTIME_ONLY + // compute first pass stats + if (img) { + int ret; + int64_t dst_time_stamp; + int64_t dst_end_time_stamp; + vpx_codec_cx_pkt_t fps_pkt; + ENCODE_FRAME_RESULT encode_frame_result; + vp9_init_encode_frame_result(&encode_frame_result); + // TODO(angiebird): Call vp9_first_pass directly + ret = vp9_get_compressed_data( + cpi, &lib_flags, &size, cx_data, cx_data_sz, &dst_time_stamp, + &dst_end_time_stamp, !img, &encode_frame_result); + assert(size == 0); // There is no compressed data in the first pass + (void)ret; + assert(ret == 0); + fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.this_frame_stats); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt); + } else { + if (!cpi->twopass.first_pass_done) { + vpx_codec_cx_pkt_t fps_pkt; + vp9_end_first_pass(cpi); + fps_pkt = get_first_pass_stats_pkt(&cpi->twopass.total_stats); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &fps_pkt); + } + } +#else // !CONFIG_REALTIME_ONLY + assert(0); +#endif // !CONFIG_REALTIME_ONLY + } else { + ENCODE_FRAME_RESULT encode_frame_result; + int64_t dst_time_stamp; + int64_t dst_end_time_stamp; + vp9_init_encode_frame_result(&encode_frame_result); + while (cx_data_sz >= ctx->cx_data_sz / 2 && + -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, + cx_data_sz, &dst_time_stamp, + &dst_end_time_stamp, !img, + &encode_frame_result)) { + // Pack psnr pkt. + if (size > 0) { + PSNR_STATS psnr; + if (vp9_get_psnr(cpi, &psnr)) { + vpx_codec_cx_pkt_t psnr_pkt = get_psnr_pkt(&psnr); + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &psnr_pkt); + } + } -#if CONFIG_SPATIAL_SVC - if (cpi->use_svc) - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers] - .layer_size += size; -#endif + if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { + // Pack invisible frames with the next visible frame + if (!cpi->common.show_frame || + (cpi->use_svc && cpi->svc.spatial_layer_id < + cpi->svc.number_spatial_layers - 1)) { + if (ctx->pending_cx_data == NULL) ctx->pending_cx_data = cx_data; + ctx->pending_cx_data_sz += size; + if (size) + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + cx_data += size; + cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = + cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; - // Pack invisible frames with the next visible frame - if (!cpi->common.show_frame || - (cpi->use_svc && - cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)) { - if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; - ctx->pending_cx_data_sz += size; - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; - ctx->pending_frame_magnitude |= size; - cx_data += size; - cx_data_sz -= size; + if (ctx->output_cx_pkt_cb.output_cx_pkt) { + pkt.kind = VPX_CODEC_CX_FRAME_PKT; + pkt.data.frame.pts = + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + + ctx->pts_offset; + pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.buf = ctx->pending_cx_data; + pkt.data.frame.sz = size; + ctx->pending_cx_data = NULL; + ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + ctx->pending_frame_magnitude = 0; + ctx->output_cx_pkt_cb.output_cx_pkt( + &pkt, ctx->output_cx_pkt_cb.user_priv); + } + continue; + } - if (ctx->output_cx_pkt_cb.output_cx_pkt) { - pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = - ticks_to_timebase_units(timebase, dst_time_stamp); - pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); - pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + // Add the frame packet to the list of returned packets. + pkt.kind = VPX_CODEC_CX_FRAME_PKT; + pkt.data.frame.pts = + ticks_to_timebase_units(timebase_in_ts, dst_time_stamp) + + ctx->pts_offset; + pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( + timebase_in_ts, dst_end_time_stamp - dst_time_stamp); + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; + + if (ctx->pending_cx_data) { + if (size) + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + ctx->pending_cx_data_sz += size; + // write the superframe only for the case when + if (!ctx->output_cx_pkt_cb.output_cx_pkt) + size += write_superframe_index(ctx); pkt.data.frame.buf = ctx->pending_cx_data; - pkt.data.frame.sz = size; + pkt.data.frame.sz = ctx->pending_cx_data_sz; ctx->pending_cx_data = NULL; ctx->pending_cx_data_sz = 0; ctx->pending_frame_count = 0; ctx->pending_frame_magnitude = 0; + } else { + pkt.data.frame.buf = cx_data; + pkt.data.frame.sz = size; + } + pkt.data.frame.partition_id = -1; + + if (ctx->output_cx_pkt_cb.output_cx_pkt) ctx->output_cx_pkt_cb.output_cx_pkt( &pkt, ctx->output_cx_pkt_cb.user_priv); + else + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + + cx_data += size; + cx_data_sz -= size; + if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id == + cpi->svc.number_spatial_layers - 1)) { + // Encoded all spatial layers; exit loop. + break; } - continue; - } - - // Add the frame packet to the list of returned packets. - pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); - pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); - pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); - - if (ctx->pending_cx_data) { - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; - ctx->pending_frame_magnitude |= size; - ctx->pending_cx_data_sz += size; - // write the superframe only for the case when - if (!ctx->output_cx_pkt_cb.output_cx_pkt) - size += write_superframe_index(ctx); - pkt.data.frame.buf = ctx->pending_cx_data; - pkt.data.frame.sz = ctx->pending_cx_data_sz; - ctx->pending_cx_data = NULL; - ctx->pending_cx_data_sz = 0; - ctx->pending_frame_count = 0; - ctx->pending_frame_magnitude = 0; - } else { - pkt.data.frame.buf = cx_data; - pkt.data.frame.sz = size; - } - pkt.data.frame.partition_id = -1; - - if (ctx->output_cx_pkt_cb.output_cx_pkt) - ctx->output_cx_pkt_cb.output_cx_pkt(&pkt, - ctx->output_cx_pkt_cb.user_priv); - else - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); - - cx_data += size; - cx_data_sz -= size; -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC - if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { - vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; - int sl; - vp9_zero(pkt_sizes); - vp9_zero(pkt_psnr); - pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; - pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR; - for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - LAYER_CONTEXT *lc = - &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; - pkt_sizes.data.layer_sizes[sl] = lc->layer_size; - pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt; - lc->layer_size = 0; - } - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes); - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); - } -#endif -#endif - if (is_one_pass_cbr_svc(cpi) && - (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { - // Encoded all spatial layers; exit loop. - break; } } } @@ -1303,9 +1665,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1319,9 +1680,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1329,14 +1689,13 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; - yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1346,9 +1705,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1356,6 +1714,34 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, #endif } +// Returns the contents of CodecPrivate described in: +// https://www.webmproject.org/docs/container/#vp9-codec-feature-metadata-codecprivate +// This includes Profile, Level, Bit depth and Chroma subsampling. Each entry +// is 3 bytes. 1 byte ID, 1 byte length (= 1) and 1 byte value. +static vpx_fixed_buf_t *encoder_get_global_headers(vpx_codec_alg_priv_t *ctx) { + if (!ctx->cpi) return NULL; + + const unsigned int profile = ctx->cfg.g_profile; + const VP9_LEVEL level = vp9_get_level(&ctx->cpi->level_info.level_spec); + const vpx_bit_depth_t bit_depth = ctx->cfg.g_bit_depth; + const int subsampling = ctx->global_header_subsampling; + const uint8_t buf[12] = { + 1, 1, (uint8_t)profile, 2, 1, (uint8_t)level, + 3, 1, (uint8_t)bit_depth, 4, 1, (uint8_t)subsampling + }; + + if (ctx->global_headers.buf) free(ctx->global_headers.buf); + ctx->global_headers.buf = malloc(sizeof(buf)); + if (!ctx->global_headers.buf) return NULL; + + ctx->global_headers.sz = sizeof(buf); + // No data or I440, which isn't mapped. + if (ctx->global_header_subsampling == -1) ctx->global_headers.sz -= 3; + memcpy(ctx->global_headers.buf, buf, ctx->global_headers.sz); + + return &ctx->global_headers; +} + static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; vp9_ppflags_t flags; @@ -1370,17 +1756,20 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); - // TODO(yaowu): Need to re-implement and test for VP9. + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); + } return VPX_CODEC_INVALID_PARAM; } @@ -1392,11 +1781,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1407,11 +1795,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1419,13 +1806,11 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -1445,6 +1830,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } + + vp9_set_row_mt(ctx->cpi); + return VPX_CODEC_OK; } @@ -1453,22 +1841,23 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; + int sl; - svc->first_spatial_layer_to_encode = data->spatial_layer_id; svc->spatial_layer_to_encode = data->spatial_layer_id; + svc->first_spatial_layer_to_encode = data->spatial_layer_id; + // TODO(jianj): Deprecated to be removed. svc->temporal_layer_id = data->temporal_layer_id; + // Allow for setting temporal layer per spatial layer for superframe. + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + svc->temporal_layer_id_per_spatial[sl] = + data->temporal_layer_id_per_spatial[sl]; + } // Checks on valid layer_id input. if (svc->temporal_layer_id < 0 || svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { return VPX_CODEC_INVALID_PARAM; } - if (svc->first_spatial_layer_to_encode < 0 || - svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) { - return VPX_CODEC_INVALID_PARAM; - } - // First spatial layer to encode not implemented for two-pass. - if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0) - return VPX_CODEC_INVALID_PARAM; + return VPX_CODEC_OK; } @@ -1499,29 +1888,112 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; + // Checks on valid scale factors. + if (params->scaling_factor_num[sl] < 1 || + params->scaling_factor_den[sl] < 1 || + (params->scaling_factor_num[sl] > params->scaling_factor_den[sl])) { + return VPX_CODEC_INVALID_PARAM; + } lc->scaling_factor_num = params->scaling_factor_num[sl]; lc->scaling_factor_den = params->scaling_factor_den[sl]; lc->speed = params->speed_per_layer[sl]; + lc->loopfilter_ctrl = params->loopfilter_ctrl[sl]; } } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + // TODO(jianj): Remove these 3, deprecated. + data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); int sl; + cpi->svc.use_set_ref_frame_config = 1; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl]; - cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl]; - cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl]; - cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl]; + cpi->svc.reference_last[sl] = data->reference_last[sl]; + cpi->svc.reference_golden[sl] = data->reference_golden[sl]; + cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl]; + cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.duration[sl] = data->duration[sl]; } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int data = va_arg(args, int); + VP9_COMP *const cpi = ctx->cpi; + cpi->svc.disable_inter_layer_pred = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *); + int sl; + cpi->svc.framedrop_mode = data->framedrop_mode; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. + cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->svc.use_gf_temporal_ref = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_spatial_layer_sync_t *data = + va_arg(args, vpx_svc_spatial_layer_sync_t *); + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl]; + cpi->svc.set_intra_only_frame = data->base_layer_intra_only; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_delta_q_uv(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + int data = va_arg(args, int); + data = clamp(data, -15, 15); + extra_cfg.delta_q_uv = data; + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp = @@ -1562,13 +2034,99 @@ static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.ext_use_post_encode_drop = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_disable_overshoot_maxq_cbr( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.disable_overshoot_maxq_cbr = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_disable_loopfilter(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->loopfilter_ctrl = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_rc_funcs_t funcs = *CAST(VP9E_SET_EXTERNAL_RATE_CONTROL, args); + VP9_COMP *cpi = ctx->cpi; + EXT_RATECTRL *ext_ratectrl = &cpi->ext_ratectrl; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->pass == 2) { + const FRAME_INFO *frame_info = &cpi->frame_info; + vpx_rc_config_t ratectrl_config; + vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); + + ratectrl_config.frame_width = frame_info->frame_width; + ratectrl_config.frame_height = frame_info->frame_height; + ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; + // TODO(angiebird): Double check whether this is the proper way to set up + // target_bitrate and frame_rate. + ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); + ratectrl_config.frame_rate_num = oxcf->g_timebase.den; + ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + ratectrl_config.min_base_q_index = oxcf->best_allowed_q; + ratectrl_config.max_base_q_index = oxcf->worst_allowed_q; + ratectrl_config.base_qp = oxcf->cq_level; + + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; + } + + codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); + if (codec_status != VPX_CODEC_OK) { + return codec_status; + } + } + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const int qp = va_arg(args, int); + vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + vpx_codec_err_t res; + + if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM; + + cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp; + extra_cfg.aq_mode = 0; + cpi->fixed_qp_onepass = 1; + + res = update_extra_cfg(ctx, &extra_cfg); + return res; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, @@ -1577,6 +2135,8 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { VP9E_SET_TPL, ctrl_set_tpl_model }, + { VP9E_SET_KEY_FRAME_FILTERING, ctrl_set_keyframe_filtering }, { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type }, @@ -1603,14 +2163,30 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, + { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, + { VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, ctrl_set_disable_overshoot_maxq_cbr }, + { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, + { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, + { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, + { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, + { VP9E_SET_DELTA_Q_UV, ctrl_set_delta_q_uv }, + { VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter }, + { VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl }, + { VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control }, + { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 }, + { VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, ctrl_get_quantizer_svc_layers }, + { VP9E_GET_LOOPFILTER_LEVEL, ctrl_get_loopfilter_level }, { VP9_GET_REFERENCE, ctrl_get_reference }, { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; @@ -1619,7 +2195,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, // g_usage + 0, // g_usage (unused) 8, // g_threads 0, // g_profile @@ -1640,13 +2216,13 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_resize_allowed 0, // rc_scaled_width 0, // rc_scaled_height - 60, // rc_resize_down_thresold - 30, // rc_resize_up_thresold + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh VPX_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in { NULL, 0 }, // rc_firstpass_mb_stats_in - 256, // rc_target_bandwidth + 256, // rc_target_bitrate 0, // rc_min_quantizer 63, // rc_max_quantizer 25, // rc_undershoot_pct @@ -1659,6 +2235,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 50, // rc_two_pass_vbrbias 0, // rc_two_pass_vbrmin_section 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) // keyframing settings (kf) VPX_KF_AUTO, // g_kfmode @@ -1667,14 +2244,30 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { VPX_SS_DEFAULT_LAYERS, // ss_number_layers { 0 }, - { 0 }, // ss_target_bitrate - 1, // ts_number_layers - { 0 }, // ts_target_bitrate - { 0 }, // ts_rate_decimator - 0, // ts_periodicity - { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate - 0 // temporal_layering_mode + { 0 }, // ss_target_bitrate + 1, // ts_number_layers + { 0 }, // ts_target_bitrate + { 0 }, // ts_rate_decimator + 0, // ts_periodicity + { 0 }, // ts_layer_id + { 0 }, // layer_target_bitrate + 0, // temporal_layering_mode + 0, // use_vizier_rc_params + { 1, 1 }, // active_wq_factor + { 1, 1 }, // err_per_mb_factor + { 1, 1 }, // sr_default_decay_limit + { 1, 1 }, // sr_diff_factor + { 1, 1 }, // kf_err_per_mb_factor + { 1, 1 }, // kf_frame_min_boost_factor + { 1, 1 }, // kf_frame_max_boost_first_factor + { 1, 1 }, // kf_frame_max_boost_subs_factor + { 1, 1 }, // kf_max_total_boost_factor + { 1, 1 }, // gf_max_total_boost_factor + { 1, 1 }, // gf_frame_max_boost_factor + { 1, 1 }, // zm_factor + { 1, 1 }, // rd_mult_inter_qp_fac + { 1, 1 }, // rd_mult_arf_qp_fac + { 1, 1 }, // rd_mult_key_qp_fac } }, }; @@ -1701,13 +2294,238 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { }, { // NOLINT - 1, // 1 cfg map - encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t - encoder_encode, // vpx_codec_encode_fn_t - encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t - encoder_set_config, // vpx_codec_enc_config_set_fn_t - NULL, // vpx_codec_get_global_headers_fn_t - encoder_get_preview, // vpx_codec_get_preview_frame_fn_t - NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + 1, // 1 cfg map + encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t + encoder_encode, // vpx_codec_encode_fn_t + encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t + encoder_set_config, // vpx_codec_enc_config_set_fn_t + encoder_get_global_headers, // vpx_codec_get_global_headers_fn_t + encoder_get_preview, // vpx_codec_get_preview_frame_fn_t + NULL, // vpx_codec_enc_mr_get_mem_loc_fn_t + NULL // vpx_codec_enc_mr_free_mem_loc_fn_t } }; + +static vpx_codec_enc_cfg_t get_enc_cfg(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, + vpx_enc_pass enc_pass) { + vpx_codec_enc_cfg_t enc_cfg = encoder_usage_cfg_map[0].cfg; + enc_cfg.g_w = frame_width; + enc_cfg.g_h = frame_height; + enc_cfg.rc_target_bitrate = target_bitrate; + enc_cfg.g_pass = enc_pass; + // g_timebase is the inverse of frame_rate + enc_cfg.g_timebase.num = frame_rate.den; + enc_cfg.g_timebase.den = frame_rate.num; + return enc_cfg; +} + +static vp9_extracfg get_extra_cfg(void) { + vp9_extracfg extra_cfg = default_extra_cfg; + return extra_cfg; +} + +VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, + int target_level, + vpx_enc_pass enc_pass) { + /* This function will generate the same VP9EncoderConfig used by the + * vpxenc command given below. + * The configs in the vpxenc command corresponds to parameters of + * vp9_get_encoder_config() as follows. + * + * WIDTH: frame_width + * HEIGHT: frame_height + * FPS: frame_rate + * BITRATE: target_bitrate + * CPU_USED:encode_speed + * TARGET_LEVEL: target_level + * + * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig + * + * vpxenc command: + * INPUT=bus_cif.y4m + * OUTPUT=output.webm + * WIDTH=352 + * HEIGHT=288 + * BITRATE=600 + * FPS=30/1 + * LIMIT=150 + * CPU_USED=0 + * TARGET_LEVEL=0 + * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS + * --lag-in-frames=25 \ + * --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \ + * --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=150 \ + * --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --bias-pct=50 \ + * --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \ + * --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \ + * --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \ + * --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT + */ + + VP9EncoderConfig oxcf; + vp9_extracfg extra_cfg = get_extra_cfg(); + vpx_codec_enc_cfg_t enc_cfg = get_enc_cfg( + frame_width, frame_height, frame_rate, target_bitrate, enc_pass); + set_encoder_config(&oxcf, &enc_cfg, &extra_cfg); + + // These settings are made to match the settings of the vpxenc command. + oxcf.key_freq = 150; + oxcf.under_shoot_pct = 100; + oxcf.over_shoot_pct = 100; + oxcf.max_threads = 0; + oxcf.tile_columns = 0; + oxcf.frame_parallel_decoding_mode = 0; + oxcf.two_pass_vbrmax_section = 150; + oxcf.speed = abs(encode_speed); + oxcf.target_level = target_level; + return oxcf; +} + +#define DUMP_STRUCT_VALUE(fp, structure, value) \ + fprintf(fp, #value " %" PRId64 "\n", (int64_t)(structure)->value) + +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp) { + DUMP_STRUCT_VALUE(fp, oxcf, profile); + DUMP_STRUCT_VALUE(fp, oxcf, bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, width); + DUMP_STRUCT_VALUE(fp, oxcf, height); + DUMP_STRUCT_VALUE(fp, oxcf, input_bit_depth); + DUMP_STRUCT_VALUE(fp, oxcf, init_framerate); + // TODO(angiebird): dump g_timebase + // TODO(angiebird): dump g_timebase_in_ts + + DUMP_STRUCT_VALUE(fp, oxcf, target_bandwidth); + + DUMP_STRUCT_VALUE(fp, oxcf, noise_sensitivity); + DUMP_STRUCT_VALUE(fp, oxcf, sharpness); + DUMP_STRUCT_VALUE(fp, oxcf, speed); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_intra_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, rc_max_inter_bitrate_pct); + DUMP_STRUCT_VALUE(fp, oxcf, gf_cbr_boost_pct); + + DUMP_STRUCT_VALUE(fp, oxcf, mode); + DUMP_STRUCT_VALUE(fp, oxcf, pass); + + // Key Framing Operations + DUMP_STRUCT_VALUE(fp, oxcf, auto_key); + DUMP_STRUCT_VALUE(fp, oxcf, key_freq); + + DUMP_STRUCT_VALUE(fp, oxcf, lag_in_frames); + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + DUMP_STRUCT_VALUE(fp, oxcf, rc_mode); + + // buffer targeting aggressiveness + DUMP_STRUCT_VALUE(fp, oxcf, under_shoot_pct); + DUMP_STRUCT_VALUE(fp, oxcf, over_shoot_pct); + + // buffering parameters + // TODO(angiebird): dump tarting_buffer_level_ms + // TODO(angiebird): dump ptimal_buffer_level_ms + // TODO(angiebird): dump maximum_buffer_size_ms + + // Frame drop threshold. + DUMP_STRUCT_VALUE(fp, oxcf, drop_frames_water_mark); + + // controlling quality + DUMP_STRUCT_VALUE(fp, oxcf, fixed_q); + DUMP_STRUCT_VALUE(fp, oxcf, worst_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, best_allowed_q); + DUMP_STRUCT_VALUE(fp, oxcf, cq_level); + DUMP_STRUCT_VALUE(fp, oxcf, aq_mode); + + // Special handling of Adaptive Quantization for AltRef frames + DUMP_STRUCT_VALUE(fp, oxcf, alt_ref_aq); + + // Internal frame size scaling. + DUMP_STRUCT_VALUE(fp, oxcf, resize_mode); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_width); + DUMP_STRUCT_VALUE(fp, oxcf, scaled_frame_height); + + // Enable feature to reduce the frame quantization every x frames. + DUMP_STRUCT_VALUE(fp, oxcf, frame_periodic_boost); + + // two pass datarate control + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrbias); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmin_section); + DUMP_STRUCT_VALUE(fp, oxcf, two_pass_vbrmax_section); + DUMP_STRUCT_VALUE(fp, oxcf, vbr_corpus_complexity); + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + // Spatial and temporal scalability. + DUMP_STRUCT_VALUE(fp, oxcf, ss_number_layers); + DUMP_STRUCT_VALUE(fp, oxcf, ts_number_layers); + + // Bitrate allocation for spatial layers. + // TODO(angiebird): dump layer_target_bitrate[VPX_MAX_LAYERS] + // TODO(angiebird): dump ss_target_bitrate[VPX_SS_MAX_LAYERS] + // TODO(angiebird): dump ss_enable_auto_arf[VPX_SS_MAX_LAYERS] + // TODO(angiebird): dump ts_rate_decimator[VPX_TS_MAX_LAYERS] + + DUMP_STRUCT_VALUE(fp, oxcf, enable_auto_arf); + DUMP_STRUCT_VALUE(fp, oxcf, encode_breakout); + DUMP_STRUCT_VALUE(fp, oxcf, error_resilient_mode); + DUMP_STRUCT_VALUE(fp, oxcf, frame_parallel_decoding_mode); + + DUMP_STRUCT_VALUE(fp, oxcf, arnr_max_frames); + DUMP_STRUCT_VALUE(fp, oxcf, arnr_strength); + + DUMP_STRUCT_VALUE(fp, oxcf, min_gf_interval); + DUMP_STRUCT_VALUE(fp, oxcf, max_gf_interval); + + DUMP_STRUCT_VALUE(fp, oxcf, tile_columns); + DUMP_STRUCT_VALUE(fp, oxcf, tile_rows); + + DUMP_STRUCT_VALUE(fp, oxcf, enable_tpl_model); + + DUMP_STRUCT_VALUE(fp, oxcf, enable_keyframe_filtering); + + DUMP_STRUCT_VALUE(fp, oxcf, max_threads); + + DUMP_STRUCT_VALUE(fp, oxcf, target_level); + + // TODO(angiebird): dump two_pass_stats_in + DUMP_STRUCT_VALUE(fp, oxcf, tuning); + DUMP_STRUCT_VALUE(fp, oxcf, content); +#if CONFIG_VP9_HIGHBITDEPTH + DUMP_STRUCT_VALUE(fp, oxcf, use_highbitdepth); +#endif + DUMP_STRUCT_VALUE(fp, oxcf, color_space); + DUMP_STRUCT_VALUE(fp, oxcf, color_range); + DUMP_STRUCT_VALUE(fp, oxcf, render_width); + DUMP_STRUCT_VALUE(fp, oxcf, render_height); + DUMP_STRUCT_VALUE(fp, oxcf, temporal_layering_mode); + + DUMP_STRUCT_VALUE(fp, oxcf, row_mt); + DUMP_STRUCT_VALUE(fp, oxcf, motion_vector_unit_test); + DUMP_STRUCT_VALUE(fp, oxcf, delta_q_uv); +} + +FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf) { + FRAME_INFO frame_info; + int dummy; + frame_info.frame_width = oxcf->width; + frame_info.frame_height = oxcf->height; + frame_info.render_frame_width = oxcf->width; + frame_info.render_frame_height = oxcf->height; + frame_info.bit_depth = oxcf->bit_depth; + vp9_set_mi_size(&frame_info.mi_rows, &frame_info.mi_cols, &dummy, + frame_info.frame_width, frame_info.frame_height); + vp9_set_mb_size(&frame_info.mb_rows, &frame_info.mb_cols, &frame_info.num_mbs, + frame_info.mi_rows, frame_info.mi_cols); + // TODO(angiebird): Figure out how to get subsampling_x/y here + return frame_info; +} + +void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf, + const vpx_fixed_buf_t *stats) { + oxcf->two_pass_stats_in = *stats; +} diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.h b/media/libvpx/libvpx/vp9/vp9_cx_iface.h new file mode 100644 index 0000000000..f2de8507ff --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_VP9_CX_IFACE_H_ +#define VPX_VP9_VP9_CX_IFACE_H_ +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/common/vp9_onyxc_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height, + vpx_rational_t frame_rate, + int target_bitrate, int encode_speed, + int target_level, + vpx_enc_pass enc_pass); + +void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp); + +FRAME_INFO vp9_get_frame_info(const VP9EncoderConfig *oxcf); + +static INLINE int64_t +timebase_units_to_ticks(const vpx_rational64_t *timestamp_ratio, int64_t n) { + return n * timestamp_ratio->num / timestamp_ratio->den; +} + +static INLINE int64_t +ticks_to_timebase_units(const vpx_rational64_t *timestamp_ratio, int64_t n) { + int64_t round = timestamp_ratio->num / 2; + if (round > 0) --round; + return (n * timestamp_ratio->den + round) / timestamp_ratio->num; +} + +void vp9_set_first_pass_stats(VP9EncoderConfig *oxcf, + const vpx_fixed_buf_t *stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_VP9_CX_IFACE_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c index 0a3e84a0da..b6eeee008d 100644 --- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c @@ -19,7 +19,6 @@ #include "vpx/vpx_decoder.h" #include "vpx_dsp/bitreader_buffer.h" #include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_util/vpx_thread.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_frame_buffers.h" @@ -47,12 +46,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->init_flags = ctx->init_flags; priv->si.sz = sizeof(priv->si); priv->flushed = 0; - // Only do frame parallel decode when threads > 1. - priv->frame_parallel_decode = - (ctx->config.dec && (ctx->config.dec->threads > 1) && - (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) - ? 1 - : 0; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; @@ -63,33 +56,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, } static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { - if (ctx->frame_workers != NULL) { - int i; - // Shutdown all threads before reclaiming any memory. The frame-level - // parallel decoder may access data from another worker. - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - vpx_get_worker_interface()->end(worker); - } - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - vp9_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_VP9_POSTPROC - vp9_free_postproc_buffers(&frame_worker_data->pbi->common); -#endif - vp9_decoder_remove(frame_worker_data->pbi); - vpx_free(frame_worker_data->scratch_buffer); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&frame_worker_data->stats_mutex); - pthread_cond_destroy(&frame_worker_data->stats_cond); -#endif - vpx_free(frame_worker_data); - } -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); } if (ctx->buffer_pool) { @@ -97,7 +65,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } - vpx_free(ctx->frame_workers); vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -129,7 +96,7 @@ static vpx_codec_err_t decoder_peek_si_internal( const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { int intra_only_flag = 0; - uint8_t clear_buffer[10]; + uint8_t clear_buffer[11]; if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; @@ -190,6 +157,9 @@ static vpx_codec_err_t decoder_peek_si_internal( if (profile > PROFILE_0) { if (!parse_bitdepth_colorspace_sampling(profile, &rb)) return VPX_CODEC_UNSUP_BITSTREAM; + // The colorspace info may cause vp9_read_frame_size() to need 11 + // bytes. + if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM; } rb.bit_offset += REF_FRAMES; // refresh_frame_flags vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); @@ -230,34 +200,32 @@ static vpx_codec_err_t update_error_state( return error->error_code; } -static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { - int i; +static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; - cm->new_fb_idx = INVALID_IDX; - cm->byte_alignment = ctx->byte_alignment; - cm->skip_loop_filter = ctx->skip_loop_filter; + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; - if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { - pool->get_fb_cb = ctx->get_ext_fb_cb; - pool->release_fb_cb = ctx->release_ext_fb_cb; - pool->cb_priv = ctx->ext_priv; - } else { - pool->get_fb_cb = vp9_get_frame_buffer; - pool->release_fb_cb = vp9_release_frame_buffer; - - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to initialize internal frame buffers"); - - pool->cb_priv = &pool->int_frame_buffers; + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + return VPX_CODEC_MEM_ERROR; } + + pool->cb_priv = &pool->int_frame_buffers; } + + return VPX_CODEC_OK; } static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { @@ -273,138 +241,62 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } -static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = frame_worker_data->data; - (void)arg2; +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) - frame_worker_data->result = vp9_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); - frame_worker_data->data_end = data; - - if (frame_worker_data->pbi->frame_parallel_decode) { - // In frame parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. - vp9_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { - // Check decode result in serial decode. - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - frame_worker_data->pbi->need_resync = 1; - } - return !frame_worker_data->result; -} +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { - int i; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - + vpx_codec_err_t res; ctx->last_show_frame = -1; - ctx->next_submit_worker_id = 0; - ctx->last_submit_worker_id = 0; - ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1; - if (ctx->num_frame_workers > MAX_DECODE_THREADS) - ctx->num_frame_workers = MAX_DECODE_THREADS; - ctx->available_threads = ctx->num_frame_workers; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } -#endif + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; - ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers * - sizeof(*ctx->frame_workers)); - if (ctx->frame_workers == NULL) { - set_error_detail(ctx, "Failed to allocate frame_workers"); - return VPX_CODEC_MEM_ERROR; - } + RANGE_CHECK(ctx, row_mt, 0, 1); + ctx->pbi->row_mt = ctx->row_mt; - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *frame_worker_data = NULL; - winterface->init(worker); - worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); - if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (frame_worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->pbi->frame_worker_owner = worker; - frame_worker_data->worker_id = i; - frame_worker_data->scratch_buffer = NULL; - frame_worker_data->scratch_buffer_size = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 0; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); - return VPX_CODEC_MEM_ERROR; - } - - if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); - return VPX_CODEC_MEM_ERROR; - } -#endif - // If decoding in serial mode, FrameWorker thread could create tile worker - // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - - frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; - worker->hook = (VPxWorkerHook)frame_worker_hook; - if (!winterface->reset(worker)) { - set_error_detail(ctx, "Frame Worker thread creation failed"); - return VPX_CODEC_MEM_ERROR; - } - } + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; // If postprocessing was enabled by the application and a // configuration has not been provided, default it. if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - init_buffer_callbacks(ctx); - - return VPX_CODEC_OK; + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, const VP9Decoder *const pbi) { - // Clear resync flag if worker got a key frame or intra only frame. + // Clear resync flag if the decoder got a key frame or intra only frame. if (ctx->need_resync == 1 && pbi->need_resync == 0 && (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) ctx->need_resync = 0; @@ -412,10 +304,7 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, - void *user_priv, int64_t deadline) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - (void)deadline; - + void *user_priv) { // Determine the stream parameters. Note that we rely on peek_si to // validate that we have a buffer that does not wrap around the top // of the heap. @@ -429,108 +318,29 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; + ctx->user_priv = user_priv; - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - worker->had_error = 0; - winterface->execute(worker); - - // Update data pointer after decode. - *data = frame_worker_data->data_end; - - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); - - check_resync(ctx, frame_worker_data->pbi); - } else { - VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - vp9_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. - // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. - if (frame_worker_data->scratch_buffer_size < data_sz) { - vpx_free(frame_worker_data->scratch_buffer); - frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); } + check_resync(ctx, ctx->pbi); + return VPX_CODEC_OK; } -static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. - winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - - check_resync(ctx, frame_worker_data->pbi); - - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } -} - static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, long deadline) { + void *user_priv) { const uint8_t *data_start = data; - const uint8_t *const data_end = data + data_sz; vpx_codec_err_t res; uint32_t frame_sizes[8]; int frame_count; @@ -543,9 +353,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; - // Initialize the decoder workers on the first frame. - if (ctx->frame_workers == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { + res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -556,91 +366,37 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) frame_count = ctx->svc_spatial_layer + 1; - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. - if (frame_count > 0) { - int i; + // Decode in serial mode. + if (frame_count > 0) { + const uint8_t *const data_end = data + data_sz; + int i; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data, data_sz, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv); if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; } } else { - // Decode in serial mode. - if (frame_count > 0) { - int i; + const uint8_t *const data_end = data + data_sz; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + res = decode_one(ctx, &data_start, frame_size, user_priv); + if (res != VPX_CODEC_OK) return res; - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - data_start += frame_size; - } - } else { + // Account for suboptimal termination by the encoder. while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. - while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; } } } @@ -648,80 +404,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return res; } -static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); - } -} - static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { - return NULL; - } + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } - - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (*iter == NULL && ctx->frame_workers != NULL) { - do { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) - set_ppflags(ctx, &flags); - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - ++ctx->available_threads; - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; - if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - return img; - } - } else { - // Decoding failed. Release the worker thread. - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } - } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } } return NULL; } @@ -731,7 +435,7 @@ static vpx_codec_err_t decoder_set_fb_fn( vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return VPX_CODEC_INVALID_PARAM; - } else if (ctx->frame_workers == NULL) { + } else if (ctx->pbi == NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. ctx->get_ext_fb_cb = cb_get; @@ -747,21 +451,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&frame_worker_data->pbi->common, - ref_frame_to_vp9_reframe(frame->frame_type), - &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -771,20 +466,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(frame_worker_data->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -794,20 +481,16 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { - YV12_BUFFER_CONFIG *fb; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); - if (fb == NULL) return VPX_CODEC_ERROR; - yuvconfig2image(&data->img, fb, NULL); - return VPX_CODEC_OK; + if (ctx->pbi) { + const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx); + if (fb == NULL) return VPX_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } } else { return VPX_CODEC_INVALID_PARAM; } @@ -832,22 +515,21 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (update_info) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -862,14 +544,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, int *corrupted = va_arg(args, int *); if (corrupted) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) - return VPX_CODEC_ERROR; + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; @@ -885,18 +562,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (frame_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return VPX_CODEC_OK; @@ -912,18 +580,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (render_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return VPX_CODEC_OK; @@ -938,13 +597,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { - if (worker) { - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; *bit_depth = cm->bit_depth; return VPX_CODEC_OK; } else { @@ -983,10 +639,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; } return VPX_CODEC_OK; } @@ -995,10 +649,8 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } return VPX_CODEC_OK; @@ -1014,6 +666,20 @@ static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1025,8 +691,11 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, + { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters + { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, { VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, { VP9_GET_REFERENCE, ctrl_get_reference }, @@ -1043,7 +712,10 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | +#if CONFIG_VP9_HIGHBITDEPTH + VPX_CODEC_CAP_HIGHBITDEPTH | +#endif + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t decoder_init, // vpx_codec_init_fn_t decoder_destroy, // vpx_codec_destroy_fn_t @@ -1065,6 +737,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { NULL, // vpx_codec_enc_config_set_fn_t NULL, // vpx_codec_get_global_headers_fn_t NULL, // vpx_codec_get_preview_frame_fn_t - NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + NULL, // vpx_codec_enc_mr_get_mem_loc_fn_t + NULL // vpx_codec_enc_mr_free_mem_loc_fn_t } }; diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.h b/media/libvpx/libvpx/vp9/vp9_dx_iface.h index c1559599b8..f60688c4db 100644 --- a/media/libvpx/libvpx/vp9/vp9_dx_iface.h +++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.h @@ -8,26 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_DX_IFACE_H_ -#define VP9_VP9_DX_IFACE_H_ +#ifndef VPX_VP9_VP9_DX_IFACE_H_ +#define VPX_VP9_VP9_DX_IFACE_H_ #include "vp9/decoder/vp9_decoder.h" typedef vpx_codec_stream_info_t vp9_stream_info_t; -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - vpx_image_t img; -} cache_frame; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -40,20 +33,8 @@ struct vpx_codec_alg_priv { int byte_alignment; int skip_loop_filter; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. - VPxWorker *frame_workers; - int num_frame_workers; - int next_submit_worker_id; - int last_submit_worker_id; - int next_output_worker_id; - int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; int need_resync; // wait for key/intra-only frame - // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + // BufferPool that holds all reference frames. BufferPool *buffer_pool; // External frame buffer info to save for VP9 common. @@ -64,6 +45,8 @@ struct vpx_codec_alg_priv { // Allow for decoding up to a given spatial layer for SVC stream. int svc_decoding; int svc_spatial_layer; + int row_mt; + int lpf_opt; }; -#endif // VP9_VP9_DX_IFACE_H_ +#endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.c b/media/libvpx/libvpx/vp9/vp9_iface_common.c new file mode 100644 index 0000000000..8d031694d8 --- /dev/null +++ b/media/libvpx/libvpx/vp9/vp9_iface_common.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE file in the root of the source tree. An additional + * intellectual property rights grant can be found in the file PATENTS. + * All contributing project authors may be found in the AUTHORS file in + * the root of the source tree. + */ + +#include "vp9/vp9_iface_common.h" +void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv) { + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + int bps; + if (!yv12->subsampling_y) { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I444; + bps = 24; + } else { + img->fmt = VPX_IMG_FMT_I422; + bps = 16; + } + } else { + if (!yv12->subsampling_x) { + img->fmt = VPX_IMG_FMT_I440; + bps = 16; + } else { + img->fmt = VPX_IMG_FMT_I420; + bps = 12; + } + } + img->cs = yv12->color_space; + img->range = yv12->color_range; + img->bit_depth = 8; + img->w = yv12->y_stride; + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); + img->d_w = yv12->y_crop_width; + img->d_h = yv12->y_crop_height; + img->r_w = yv12->render_width; + img->r_h = yv12->render_height; + img->x_chroma_shift = yv12->subsampling_x; + img->y_chroma_shift = yv12->subsampling_y; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { + // vpx_image_t uses byte strides and a pointer to the first byte + // of the image. + img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH); + img->bit_depth = yv12->bit_depth; + img->planes[VPX_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer); + img->planes[VPX_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer); + img->planes[VPX_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer); + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride; + img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + img->bps = bps; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} + +vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12) { + yv12->y_buffer = img->planes[VPX_PLANE_Y]; + yv12->u_buffer = img->planes[VPX_PLANE_U]; + yv12->v_buffer = img->planes[VPX_PLANE_V]; + + yv12->y_crop_width = img->d_w; + yv12->y_crop_height = img->d_h; + yv12->render_width = img->r_w; + yv12->render_height = img->r_h; + yv12->y_width = img->d_w; + yv12->y_height = img->d_h; + + yv12->uv_width = img->x_chroma_shift == 1 || img->fmt == VPX_IMG_FMT_NV12 + ? (1 + yv12->y_width) / 2 + : yv12->y_width; + yv12->uv_height = + img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height; + yv12->uv_crop_width = yv12->uv_width; + yv12->uv_crop_height = yv12->uv_height; + + yv12->y_stride = img->stride[VPX_PLANE_Y]; + yv12->uv_stride = img->stride[VPX_PLANE_U]; + yv12->color_space = img->cs; + yv12->color_range = img->range; + +#if CONFIG_VP9_HIGHBITDEPTH + if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + // In vpx_image_t + // planes point to uint8 address of start of data + // stride counts uint8s to reach next row + // In YV12_BUFFER_CONFIG + // y_buffer, u_buffer, v_buffer point to uint16 address of data + // stride and border counts in uint16s + // This means that all the address calculations in the main body of code + // should work correctly. + // However, before we do any pixel operations we need to cast the address + // to a uint16 ponter and double its value. + yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); + yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); + yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); + yv12->y_stride >>= 1; + yv12->uv_stride >>= 1; + yv12->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + yv12->flags = 0; + } + yv12->border = (yv12->y_stride - img->w) / 2; +#else + yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; +#endif // CONFIG_VP9_HIGHBITDEPTH + yv12->subsampling_x = img->x_chroma_shift; + yv12->subsampling_y = img->y_chroma_shift; + // When reading the data, UV are in one plane for NV12 format, thus + // x_chroma_shift is 0. After converting, UV are in separate planes, and + // subsampling_x should be set to 1. + if (img->fmt == VPX_IMG_FMT_NV12) yv12->subsampling_x = 1; + return VPX_CODEC_OK; +} diff --git a/media/libvpx/libvpx/vp9/vp9_iface_common.h b/media/libvpx/libvpx/vp9/vp9_iface_common.h index d68872750b..e646917c69 100644 --- a/media/libvpx/libvpx/vp9/vp9_iface_common.h +++ b/media/libvpx/libvpx/vp9/vp9_iface_common.h @@ -7,133 +7,27 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_IFACE_COMMON_H_ -#define VP9_VP9_IFACE_COMMON_H_ +#ifndef VPX_VP9_VP9_IFACE_COMMON_H_ +#define VPX_VP9_VP9_IFACE_COMMON_H_ +#include #include "vpx_ports/mem.h" +#include "vpx/vp8.h" +#include "vpx_scale/yv12config.h" +#include "common/vp9_enums.h" -static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, - void *user_priv) { - /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ - int bps; - if (!yv12->subsampling_y) { - if (!yv12->subsampling_x) { - img->fmt = VPX_IMG_FMT_I444; - bps = 24; - } else { - img->fmt = VPX_IMG_FMT_I422; - bps = 16; - } - } else { - if (!yv12->subsampling_x) { - img->fmt = VPX_IMG_FMT_I440; - bps = 16; - } else { - img->fmt = VPX_IMG_FMT_I420; - bps = 12; - } - } - img->cs = yv12->color_space; - img->range = yv12->color_range; - img->bit_depth = 8; - img->w = yv12->y_stride; - img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); - img->d_w = yv12->y_crop_width; - img->d_h = yv12->y_crop_height; - img->r_w = yv12->render_width; - img->r_h = yv12->render_height; - img->x_chroma_shift = yv12->subsampling_x; - img->y_chroma_shift = yv12->subsampling_y; - img->planes[VPX_PLANE_Y] = yv12->y_buffer; - img->planes[VPX_PLANE_U] = yv12->u_buffer; - img->planes[VPX_PLANE_V] = yv12->v_buffer; - img->planes[VPX_PLANE_ALPHA] = NULL; - img->stride[VPX_PLANE_Y] = yv12->y_stride; - img->stride[VPX_PLANE_U] = yv12->uv_stride; - img->stride[VPX_PLANE_V] = yv12->uv_stride; - img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; -#if CONFIG_VP9_HIGHBITDEPTH - if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) { - // vpx_image_t uses byte strides and a pointer to the first byte - // of the image. - img->fmt = (vpx_img_fmt_t)(img->fmt | VPX_IMG_FMT_HIGHBITDEPTH); - img->bit_depth = yv12->bit_depth; - img->planes[VPX_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer); - img->planes[VPX_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer); - img->planes[VPX_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer); - img->planes[VPX_PLANE_ALPHA] = NULL; - img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride; - img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride; - img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride; - img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride; - } -#endif // CONFIG_VP9_HIGHBITDEPTH - img->bps = bps; - img->user_priv = user_priv; - img->img_data = yv12->buffer_alloc; - img->img_data_owner = 0; - img->self_allocd = 0; -} +#ifdef __cplusplus +extern "C" { +#endif -static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, - YV12_BUFFER_CONFIG *yv12) { - yv12->y_buffer = img->planes[VPX_PLANE_Y]; - yv12->u_buffer = img->planes[VPX_PLANE_U]; - yv12->v_buffer = img->planes[VPX_PLANE_V]; +void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, + void *user_priv); - yv12->y_crop_width = img->d_w; - yv12->y_crop_height = img->d_h; - yv12->render_width = img->r_w; - yv12->render_height = img->r_h; - yv12->y_width = img->d_w; - yv12->y_height = img->d_h; +vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, + YV12_BUFFER_CONFIG *yv12); - yv12->uv_width = - img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width; - yv12->uv_height = - img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height; - yv12->uv_crop_width = yv12->uv_width; - yv12->uv_crop_height = yv12->uv_height; - - yv12->y_stride = img->stride[VPX_PLANE_Y]; - yv12->uv_stride = img->stride[VPX_PLANE_U]; - yv12->color_space = img->cs; - yv12->color_range = img->range; - -#if CONFIG_VP9_HIGHBITDEPTH - if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { - // In vpx_image_t - // planes point to uint8 address of start of data - // stride counts uint8s to reach next row - // In YV12_BUFFER_CONFIG - // y_buffer, u_buffer, v_buffer point to uint16 address of data - // stride and border counts in uint16s - // This means that all the address calculations in the main body of code - // should work correctly. - // However, before we do any pixel operations we need to cast the address - // to a uint16 ponter and double its value. - yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer); - yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer); - yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer); - yv12->y_stride >>= 1; - yv12->uv_stride >>= 1; - yv12->flags = YV12_FLAG_HIGHBITDEPTH; - } else { - yv12->flags = 0; - } - yv12->border = (yv12->y_stride - img->w) / 2; -#else - yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; -#endif // CONFIG_VP9_HIGHBITDEPTH - yv12->subsampling_x = img->x_chroma_shift; - yv12->subsampling_y = img->y_chroma_shift; - return VPX_CODEC_OK; -} - -static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { +static INLINE VP9_REFFRAME +ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { switch (frame) { case VP8_LAST_FRAME: return VP9_LAST_FLAG; case VP8_GOLD_FRAME: return VP9_GOLD_FLAG; @@ -142,4 +36,9 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { assert(0 && "Invalid Reference Frame"); return VP9_LAST_FLAG; } -#endif // VP9_VP9_IFACE_COMMON_H_ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_VP9_IFACE_COMMON_H_ diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk index a8ca0d5935..04af1c51e9 100644 --- a/media/libvpx/libvpx/vp9/vp9cx.mk +++ b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -16,6 +16,7 @@ VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_CX_SRCS-yes += vp9_cx_iface.c +VP9_CX_SRCS-yes += vp9_cx_iface.h VP9_CX_SRCS-yes += encoder/vp9_bitstream.c VP9_CX_SRCS-yes += encoder/vp9_context_tree.c @@ -39,9 +40,14 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h +VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c +VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h VP9_CX_SRCS-yes += encoder/vp9_encoder.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h @@ -60,6 +66,7 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c +VP9_CX_SRCS-yes += encoder/vp9_partition_models.h VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_speed_features.c @@ -70,6 +77,9 @@ VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c VP9_CX_SRCS-yes += encoder/vp9_resize.c VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h +VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.c +VP9_CX_SRCS-$(CONFIG_NON_GREEDY_MV) += encoder/vp9_non_greedy_mv.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -87,57 +97,97 @@ VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.c VP9_CX_SRCS-yes += encoder/vp9_noise_estimate.h +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.c +VP9_CX_SRCS-yes += encoder/vp9_ext_ratectrl.h ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/temporal_filter_ssse3.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/temporal_filter_avx2.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON_DOTPROD) += encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c +VP9_CX_SRCS-$(HAVE_NEON_I8MM) += encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c + VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/highbd_temporal_filter_ssse3.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/highbd_temporal_filter_avx2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_SVE2) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm -else VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -endif - -ifeq ($(ARCH_X86_64),yes) -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm -endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c -endif +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_denoiser_neon.c endif -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c -endif +VP9_CX_SRCS-$(HAVE_SVE) += encoder/arm/neon/vp9_error_sve.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c + +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_ssse3.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_avx2.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_ssse3.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_avx2.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_dotprod.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon_i8mm.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_sve2.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.h VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vp9/vp9dx.mk b/media/libvpx/libvpx/vp9/vp9dx.mk index 4c6fd00715..93a5f368bd 100644 --- a/media/libvpx/libvpx/vp9/vp9dx.mk +++ b/media/libvpx/libvpx/vp9/vp9dx.mk @@ -24,11 +24,11 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h +VP9_DX_SRCS-yes += decoder/vp9_job_queue.c +VP9_DX_SRCS-yes += decoder/vp9_job_queue.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) diff --git a/media/libvpx/libvpx/vpx/exports_spatial_svc b/media/libvpx/libvpx/vpx/exports_spatial_svc deleted file mode 100644 index d258a1d618..0000000000 --- a/media/libvpx/libvpx/vpx/exports_spatial_svc +++ /dev/null @@ -1,6 +0,0 @@ -text vpx_svc_dump_statistics -text vpx_svc_encode -text vpx_svc_get_message -text vpx_svc_init -text vpx_svc_release -text vpx_svc_set_options diff --git a/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h index 522e5c1684..ff51881eea 100644 --- a/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h +++ b/media/libvpx/libvpx/vpx/internal/vpx_codec_internal.h @@ -27,25 +27,29 @@ * * * An application instantiates a specific decoder instance by using - * vpx_codec_init() and a pointer to the algorithm's interface structure: + * vpx_codec_dec_init() and a pointer to the algorithm's interface structure: *

  *     my_app.c:
  *       extern vpx_codec_iface_t my_codec;
  *       {
  *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     
* * Once initialized, the instance is manged using other functions from * the vpx_codec_* family. */ -#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ -#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#ifndef VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#define VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ #include "../vpx_decoder.h" #include "../vpx_encoder.h" #include +#include "vpx_config.h" + #ifdef __cplusplus extern "C" { #endif @@ -66,7 +70,7 @@ typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype * * Performs algorithm-specific initialization of the decoder context. This - * function is called by the generic vpx_codec_init() wrapper function, so + * function is called by vpx_codec_dec_init() and vpx_codec_enc_init(), so * plugins implementing this interface may trust the input parameters to be * properly initialized. * @@ -175,16 +179,15 @@ typedef const struct vpx_codec_ctrl_fn_map { /*!\brief decode data function pointer prototype * * Processes a buffer of coded data. If the processing results in a new - * decoded frame becoming available, #VPX_CODEC_CB_PUT_SLICE and - * #VPX_CODEC_CB_PUT_FRAME events are generated as appropriate. This - * function is called by the generic vpx_codec_decode() wrapper function, - * so plugins implementing this interface may trust the input parameters - * to be properly initialized. + * decoded frame becoming available, put_slice and put_frame callbacks + * are invoked as appropriate. This function is called by the generic + * vpx_codec_decode() wrapper function, so plugins implementing this + * interface may trust the input parameters to be properly initialized. * * \param[in] ctx Pointer to this instance's context * \param[in] data Pointer to this block of new coded data. If - * NULL, a #VPX_CODEC_CB_PUT_FRAME event is posted - * for the previously decoded frame. + * NULL, the put_frame callback is invoked for + * the previously decoded frame. * \param[in] data_sz Size of the coded data, in bytes. * * \return Returns #VPX_CODEC_OK if the coded data was processed completely @@ -195,8 +198,7 @@ typedef const struct vpx_codec_ctrl_fn_map { typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, - void *user_priv, - long deadline); + void *user_priv); /*!\brief Decoded frames iterator * @@ -252,7 +254,7 @@ typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); typedef const vpx_codec_cx_pkt_t *(*vpx_codec_get_cx_data_fn_t)( vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); @@ -267,6 +269,8 @@ typedef vpx_image_t *(*vpx_codec_get_preview_frame_fn_t)( typedef vpx_codec_err_t (*vpx_codec_enc_mr_get_mem_loc_fn_t)( const vpx_codec_enc_cfg_t *cfg, void **mem_loc); +typedef void (*vpx_codec_enc_mr_free_mem_loc_fn_t)(void *mem_loc); + /*!\brief usage configuration mapping * * This structure stores the mapping between usage identifiers and @@ -282,7 +286,7 @@ typedef const struct vpx_codec_enc_cfg_map { vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; -/*!\brief Decoder algorithm interface interface +/*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. */ @@ -316,6 +320,8 @@ struct vpx_codec_iface { get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ + vpx_codec_enc_mr_free_mem_loc_fn_t + mr_free_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_free_mem_loc_fn_t */ } enc; }; @@ -426,6 +432,27 @@ struct vpx_internal_error_info { jmp_buf jmp; }; +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) @@ -434,12 +461,24 @@ struct vpx_internal_error_info { #endif #endif +// Tells the compiler to perform `printf` format string checking if the +// compiler supports it; see the 'format' attribute in +// . +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) +#if defined(__has_attribute) +#if __has_attribute(format) +#undef LIBVPX_FORMAT_PRINTF +#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#endif + void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, const char *fmt, - ...) CLANG_ANALYZER_NORETURN; + vpx_codec_err_t error, const char *fmt, ...) + LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN; #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#endif // VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h new file mode 100644 index 0000000000..2643b5578a --- /dev/null +++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ +#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ + +#include "vpx/vpx_encoder.h" + +namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + +struct UVDeltaQP { + // For the UV channel: the QP for the dc/ac value is given as + // GetQP() + uvdc/ac_delta_q, where the uvdc/ac_delta_q are negative numbers. + int uvdc_delta_q; + int uvac_delta_q; +}; + +struct VpxRateControlRtcConfig { + VpxRateControlRtcConfig() { + width = 1280; + height = 720; + max_quantizer = 63; + min_quantizer = 2; + target_bandwidth = 1000; + buf_initial_sz = 600; + buf_optimal_sz = 600; + buf_sz = 1000; + undershoot_pct = overshoot_pct = 50; + max_intra_bitrate_pct = 50; + max_inter_bitrate_pct = 0; + framerate = 30.0; + ts_number_layers = 1; + rc_mode = VPX_CBR; + aq_mode = 0; + layer_target_bitrate[0] = static_cast(target_bandwidth); + ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; + is_screen = false; + } + + int width; + int height; + // 0-63 + int max_quantizer; + int min_quantizer; + int64_t target_bandwidth; + int64_t buf_initial_sz; + int64_t buf_optimal_sz; + int64_t buf_sz; + int undershoot_pct; + int overshoot_pct; + int max_intra_bitrate_pct; + int max_inter_bitrate_pct; + double framerate; + // Number of temporal layers + int ts_number_layers; + int layer_target_bitrate[VPX_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + // vbr, cbr + enum vpx_rc_mode rc_mode; + int aq_mode; + int frame_drop_thresh; + bool is_screen; +}; +} // namespace libvpx +#endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ diff --git a/media/libvpx/libvpx/vpx/src/vpx_codec.c b/media/libvpx/libvpx/vpx/src/vpx_codec.c index 10331aa21b..c54a18ecd4 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_codec.c +++ b/media/libvpx/libvpx/vpx/src/vpx_codec.c @@ -13,6 +13,7 @@ * */ #include +#include #include #include "vpx/vpx_integer.h" #include "vpx/internal/vpx_codec_internal.h" @@ -50,12 +51,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) { return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { return (ctx) ? vpx_codec_err_to_string(ctx->err) : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; @@ -82,7 +83,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { } vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { - return (iface) ? iface->caps : 0; + return iface ? iface->caps : 0; } vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { @@ -97,7 +98,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { res = VPX_CODEC_INCAPABLE; - for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) { + for (entry = ctx->iface->ctrl_maps; entry->fn; entry++) { if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) { va_list ap; diff --git a/media/libvpx/libvpx/vpx/src/vpx_decoder.c b/media/libvpx/libvpx/vpx/src/vpx_decoder.c index fc1c2bccae..c79cc708cd 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_decoder.c +++ b/media/libvpx/libvpx/vpx/src/vpx_decoder.c @@ -105,6 +105,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { vpx_codec_err_t res; + (void)deadline; /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ @@ -112,10 +113,8 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, const uint8_t *data, res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; - else { - res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, - deadline); - } + else + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv); return SAVE_STATUS(ctx, res); } @@ -138,9 +137,10 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv || - !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + res = VPX_CODEC_INCAPABLE; else { ctx->priv->dec.put_frame_cb.u.put_frame = cb; ctx->priv->dec.put_frame_cb.user_priv = user_priv; @@ -157,9 +157,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; - else if (!ctx->iface || !ctx->priv || - !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; + else if (!(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) + res = VPX_CODEC_INCAPABLE; else { ctx->priv->dec.put_slice_cb.u.put_slice = cb; ctx->priv->dec.put_slice_cb.user_priv = user_priv; @@ -176,9 +177,10 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions( if (!ctx || !cb_get || !cb_release) { res = VPX_CODEC_INVALID_PARAM; - } else if (!ctx->iface || !ctx->priv || - !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { + } else if (!ctx->iface || !ctx->priv) { res = VPX_CODEC_ERROR; + } else if (!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { + res = VPX_CODEC_INCAPABLE; } else { res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, cb_priv); diff --git a/media/libvpx/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/libvpx/vpx/src/vpx_encoder.c index 42d49970fe..36dfa51897 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_encoder.c +++ b/media/libvpx/libvpx/vpx/src/vpx_encoder.c @@ -14,13 +14,13 @@ */ #include #include -#include +#include #include -#include "vp8/common/blockd.h" #include "vpx_config.h" +#include "vpx/vpx_encoder.h" #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) +#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var)) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; @@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, NULL); if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } @@ -63,13 +67,14 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, } vpx_codec_err_t vpx_codec_enc_init_multi_ver( - vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, - int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) { + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, int num_enc, vpx_codec_flags_t flags, + const vpx_rational_t *dsf, int ver) { vpx_codec_err_t res = VPX_CODEC_OK; if (ver != VPX_ENCODER_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1)) + else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1) || !dsf) res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; @@ -82,7 +87,10 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( res = VPX_CODEC_INCAPABLE; else { int i; - void *mem_loc = NULL; +#if CONFIG_MULTI_RES_ENCODING + int mem_loc_owned = 0; +#endif + void *mem_loc; if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; @@ -94,27 +102,20 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; } else { - mr_cfg.mr_low_res_mode_info = mem_loc; mr_cfg.mr_total_resolutions = num_enc; mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) - cfg->kf_mode = VPX_KF_DISABLED; + mr_cfg.mr_down_sampling_factor = *dsf; ctx->iface = iface; ctx->name = iface->name; ctx->priv = NULL; ctx->init_flags = flags; ctx->config.enc = cfg; + // ctx takes ownership of mr_cfg.mr_low_res_mode_info if and only if + // this call succeeds. The first ctx entry in the array is + // responsible for freeing the memory. res = ctx->iface->init(ctx, &mr_cfg); } @@ -132,13 +133,16 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( i--; } #if CONFIG_MULTI_RES_ENCODING - assert(mem_loc); - free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); - free(mem_loc); + if (!mem_loc_owned) { + assert(mem_loc); + iface->enc.mr_free_mem_loc(mem_loc); + } #endif return SAVE_STATUS(ctx, res); } - +#if CONFIG_MULTI_RES_ENCODING + mem_loc_owned = 1; +#endif ctx++; cfg++; dsf++; @@ -154,52 +158,42 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, unsigned int usage) { vpx_codec_err_t res; - vpx_codec_enc_cfg_map_t *map; - int i; - if (!iface || !cfg || usage > INT_MAX) + if (!iface || !cfg || usage != 0) res = VPX_CODEC_INVALID_PARAM; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; else { - res = VPX_CODEC_INVALID_PARAM; - - for (i = 0; i < iface->enc.cfg_map_count; ++i) { - map = iface->enc.cfg_maps + i; - if (map->usage == (int)usage) { - *cfg = map->cfg; - cfg->g_usage = usage; - res = VPX_CODEC_OK; - break; - } - } + assert(iface->enc.cfg_map_count == 1); + *cfg = iface->enc.cfg_maps->cfg; + res = VPX_CODEC_OK; } return res; } -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 /* On X86, disable the x87 unit's internal 80 bit precision for better * consistency with the SSE unit's 64 bit precision. */ #include "vpx_ports/x86.h" #define FLOATING_POINT_INIT() \ do { \ - unsigned short x87_orig_mode = x87_set_double_precision(); + unsigned short x87_orig_mode = x87_set_double_precision() #define FLOATING_POINT_RESTORE() \ x87_set_control_word(x87_orig_mode); \ } \ while (0) #else -static void FLOATING_POINT_INIT() {} -static void FLOATING_POINT_RESTORE() {} +static void FLOATING_POINT_INIT(void) {} +static void FLOATING_POINT_RESTORE(void) {} #endif vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline) { + vpx_enc_deadline_t deadline) { vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) @@ -208,6 +202,10 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, res = VPX_CODEC_ERROR; else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; +#if ULONG_MAX > UINT32_MAX + else if (duration > UINT32_MAX || deadline > UINT32_MAX) + res = VPX_CODEC_INVALID_PARAM; +#endif else { unsigned int num_enc = ctx->priv->enc.total_encoders; diff --git a/media/libvpx/libvpx/vpx/src/vpx_image.c b/media/libvpx/libvpx/vpx/src/vpx_image.c index dba439c10a..7f9f6cd4d0 100644 --- a/media/libvpx/libvpx/vpx/src/vpx_image.c +++ b/media/libvpx/libvpx/vpx/src/vpx_image.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include +#include #include #include @@ -20,9 +22,22 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int buf_align, unsigned int stride_align, unsigned char *img_data) { - unsigned int h, w, s, xcs, ycs, bps; - unsigned int stride_in_bytes; - int align; + unsigned int h, w, xcs, ycs, bps; + uint64_t s; + int stride_in_bytes; + unsigned int align; + + if (img != NULL) memset(img, 0, sizeof(vpx_image_t)); + + if (fmt == VPX_IMG_FMT_NONE) goto fail; + + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536) { + goto fail; + } /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -38,23 +53,9 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, /* Get sample size for this format */ switch (fmt) { - case VPX_IMG_FMT_RGB32: - case VPX_IMG_FMT_RGB32_LE: - case VPX_IMG_FMT_ARGB: - case VPX_IMG_FMT_ARGB_LE: bps = 32; break; - case VPX_IMG_FMT_RGB24: - case VPX_IMG_FMT_BGR24: bps = 24; break; - case VPX_IMG_FMT_RGB565: - case VPX_IMG_FMT_RGB565_LE: - case VPX_IMG_FMT_RGB555: - case VPX_IMG_FMT_RGB555_LE: - case VPX_IMG_FMT_UYVY: - case VPX_IMG_FMT_YUY2: - case VPX_IMG_FMT_YVYU: bps = 16; break; case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: bps = 12; break; + case VPX_IMG_FMT_NV12: bps = 12; break; case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I440: bps = 16; break; case VPX_IMG_FMT_I444: bps = 24; break; @@ -66,11 +67,11 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, } /* Get chroma shift values for this format */ + // For VPX_IMG_FMT_NV12, xcs needs to be 0 such that UV data is all read at + // once. switch (fmt) { case VPX_IMG_FMT_I420: case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I42216: xcs = 1; break; @@ -79,23 +80,36 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, switch (fmt) { case VPX_IMG_FMT_I420: + case VPX_IMG_FMT_NV12: case VPX_IMG_FMT_I440: case VPX_IMG_FMT_YV12: - case VPX_IMG_FMT_VPXI420: - case VPX_IMG_FMT_VPXYV12: case VPX_IMG_FMT_I42016: case VPX_IMG_FMT_I44016: ycs = 1; break; default: ycs = 0; break; } - /* Calculate storage sizes given the chroma subsampling */ - align = (1 << xcs) - 1; - w = (d_w + align) & ~align; - align = (1 << ycs) - 1; - h = (d_h + align) & ~align; - s = (fmt & VPX_IMG_FMT_PLANAR) ? w : bps * w / 8; - s = (s + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + /* Calculate storage sizes. */ + if (img_data) { + /* If the buffer was allocated externally, the width and height shouldn't + * be adjusted. */ + w = d_w; + h = d_h; + } else { + /* Calculate storage sizes given the chroma subsampling */ + align = (1 << xcs) - 1; + w = (d_w + align) & ~align; + assert(d_w <= w); + align = (1 << ycs) - 1; + h = (d_h + align) & ~align; + assert(d_h <= h); + } + + s = (fmt & VPX_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / 8; + s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + if (s > INT_MAX) goto fail; + stride_in_bytes = (int)s; + s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s / 2 : s; /* Allocate the new image */ if (!img) { @@ -104,16 +118,14 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, if (!img) goto fail; img->self_allocd = 1; - } else { - memset(img, 0, sizeof(vpx_image_t)); } img->img_data = img_data; if (!img_data) { - const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) - ? (uint64_t)h * s * bps / 8 - : (uint64_t)h * s; + uint64_t alloc_size; + alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? (uint64_t)h * s * bps / 8 + : (uint64_t)h * s; if (alloc_size != (size_t)alloc_size) goto fail; @@ -135,8 +147,12 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, vpx_img_fmt_t fmt, img->stride[VPX_PLANE_Y] = img->stride[VPX_PLANE_ALPHA] = stride_in_bytes; img->stride[VPX_PLANE_U] = img->stride[VPX_PLANE_V] = stride_in_bytes >> xcs; - /* Default viewport to entire image */ - if (!vpx_img_set_rect(img, 0, 0, d_w, d_h)) return img; + /* Default viewport to entire image. (This vpx_img_set_rect call always + * succeeds.) */ + int ret = vpx_img_set_rect(img, 0, 0, d_w, d_h); + assert(ret == 0); + (void)ret; + return img; fail: vpx_img_free(img); @@ -152,16 +168,15 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, unsigned int d_h, unsigned int stride_align, unsigned char *img_data) { - /* By setting buf_align = 1, we don't change buffer alignment in this - * function. */ + /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is + * not NULL. */ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, img_data); } int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h) { - unsigned char *data; - - if (x + w <= img->w && y + h <= img->h) { + if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h && + y + h <= img->h) { img->d_w = w; img->d_h = h; @@ -172,34 +187,38 @@ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, } else { const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; - data = img->img_data; + unsigned char *data = img->img_data; if (img->fmt & VPX_IMG_FMT_HAS_ALPHA) { img->planes[VPX_PLANE_ALPHA] = data + x * bytes_per_sample + y * img->stride[VPX_PLANE_ALPHA]; - data += img->h * img->stride[VPX_PLANE_ALPHA]; + data += (size_t)img->h * img->stride[VPX_PLANE_ALPHA]; } img->planes[VPX_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[VPX_PLANE_Y]; - data += img->h * img->stride[VPX_PLANE_Y]; + data += (size_t)img->h * img->stride[VPX_PLANE_Y]; - if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { + unsigned int uv_x = x >> img->x_chroma_shift; + unsigned int uv_y = y >> img->y_chroma_shift; + if (img->fmt == VPX_IMG_FMT_NV12) { img->planes[VPX_PLANE_U] = - data + (x >> img->x_chroma_shift) * bytes_per_sample + - (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; - data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + data + uv_x + uv_y * img->stride[VPX_PLANE_U]; + img->planes[VPX_PLANE_V] = img->planes[VPX_PLANE_U] + 1; + } else if (!(img->fmt & VPX_IMG_FMT_UV_FLIP)) { + img->planes[VPX_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_U]; + data += + (size_t)(img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; img->planes[VPX_PLANE_V] = - data + (x >> img->x_chroma_shift) * bytes_per_sample + - (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_V]; } else { img->planes[VPX_PLANE_V] = - data + (x >> img->x_chroma_shift) * bytes_per_sample + - (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; - data += (img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; + data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_V]; + data += + (size_t)(img->h >> img->y_chroma_shift) * img->stride[VPX_PLANE_V]; img->planes[VPX_PLANE_U] = - data + (x >> img->x_chroma_shift) * bytes_per_sample + - (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]; + data + uv_x * bytes_per_sample + uv_y * img->stride[VPX_PLANE_U]; } } return 0; diff --git a/media/libvpx/libvpx/vpx/vp8.h b/media/libvpx/libvpx/vpx/vp8.h index 059c9d0f65..f30dafed58 100644 --- a/media/libvpx/libvpx/vpx/vp8.h +++ b/media/libvpx/libvpx/vpx/vp8.h @@ -10,7 +10,7 @@ /*!\defgroup vp8 VP8 * \ingroup codecs - * VP8 is vpx's newest video compression algorithm that uses motion + * VP8 is a video compression algorithm that uses motion * compensated prediction, Discrete Cosine Transform (DCT) coding of the * prediction error signal and context dependent entropy coding techniques * based on arithmetic principles. It features: @@ -27,8 +27,8 @@ /*!\file * \brief Provides controls common to both the VP8 encoder and decoder. */ -#ifndef VPX_VP8_H_ -#define VPX_VP8_H_ +#ifndef VPX_VPX_VP8_H_ +#define VPX_VPX_VP8_H_ #include "./vpx_codec.h" #include "./vpx_image.h" @@ -47,10 +47,6 @@ enum vp8_com_control_id { VP8_SET_REFERENCE = 1, VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< \deprecated */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< \deprecated */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< \deprecated */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. These should be migrated to something like the @@ -70,12 +66,7 @@ enum vp8_postproc_level { VP8_DEBLOCK = 1 << 0, VP8_DEMACROBLOCK = 1 << 1, VP8_ADDNOISE = 1 << 2, - VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ - VP8_DEBUG_TXT_MBLK_MODES = - 1 << 4, /**< print macro block modes over each macro block */ - VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ - VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ - VP8_MFQE = 1 << 10 + VP8_MFQE = 1 << 3 }; /*!\brief post process flags @@ -132,14 +123,6 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int) -#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE @@ -150,4 +133,4 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) } // extern "C" #endif -#endif // VPX_VP8_H_ +#endif // VPX_VPX_VP8_H_ diff --git a/media/libvpx/libvpx/vpx/vp8cx.h b/media/libvpx/libvpx/vpx/vp8cx.h index cc90159bc3..3a432cc12f 100644 --- a/media/libvpx/libvpx/vpx/vp8cx.h +++ b/media/libvpx/libvpx/vpx/vp8cx.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP8CX_H_ -#define VPX_VP8CX_H_ +#ifndef VPX_VPX_VP8CX_H_ +#define VPX_VPX_VP8CX_H_ /*!\defgroup vp8_encoder WebM VP8/VP9 Encoder * \ingroup vp8 @@ -17,6 +17,7 @@ */ #include "./vp8.h" #include "./vpx_encoder.h" +#include "./vpx_ext_ratectrl.h" /*!\file * \brief Provides definitions for using VP8 or VP9 encoder algorithm within the @@ -32,7 +33,15 @@ extern "C" { * This interface provides the capability to encode raw VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_cx_algo; + +/*!\brief The interface to the VP8 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); /*!@} - end algorithm interface member group*/ @@ -41,7 +50,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); * This interface provides the capability to encode raw VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 encoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_cx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; + +/*!\brief The interface to the VP9 encoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); /*!@} - end algorithm interface member group*/ @@ -125,7 +142,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); enum vp8e_enc_control_id { /*!\brief Codec control function to pass an ROI map to encoder. * - * Supported in codecs: VP8, VP9 + * Supported in codecs: VP8 */ VP8E_SET_ROI_MAP = 8, @@ -148,13 +165,17 @@ enum vp8e_enc_control_id { * speed at the expense of quality. * * \note Valid range for VP8: -16..16 - * \note Valid range for VP9: -8..8 + * \note Valid range for VP9: -9..9 + * \note A negative value (-n) is treated as its absolute value (n) in VP9. * * Supported in codecs: VP8, VP9 */ VP8E_SET_CPUUSED = 13, - /*!\brief Codec control function to enable automatic set and use alf frames. + /*!\brief Codec control function to enable automatic use of arf frames. + * + * \note Valid range for VP8: 0..1 + * \note Valid range for VP9: 0..6 * * Supported in codecs: VP8, VP9 */ @@ -169,7 +190,10 @@ enum vp8e_enc_control_id { */ VP8E_SET_NOISE_SENSITIVITY, - /*!\brief Codec control function to set sharpness. + /*!\brief Codec control function to set higher sharpness at the expense + * of a lower PSNR. + * + * \note Valid range: 0..7 * * Supported in codecs: VP8, VP9 */ @@ -225,10 +249,10 @@ enum vp8e_enc_control_id { */ VP8E_SET_TUNING, - /*!\brief Codec control function to set constrained quality level. + /*!\brief Codec control function to set constrained / constant quality level. * - * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be - * set to #VPX_CQ. + * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage must + * be set to #VPX_CQ or #VPX_Q * \note Valid range: 0..63 * * Supported in codecs: VP8, VP9 @@ -279,7 +303,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP9 @@ -333,11 +357,12 @@ enum vp8e_enc_control_id { * 2 = 4 tile columns * ..... * n = 2**n tile columns - * The requested tile columns will be capped by encoder based on image size - * limitation (The minimum width of a tile column is 256 pixel, the maximum - * is 4096). + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). * - * By default, the value is 0, i.e. one single column tile for entire image. + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. * * Supported in codecs: VP9 */ @@ -368,10 +393,10 @@ enum vp8e_enc_control_id { * VP9 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one - * video frames in the decoder. This control function provides a mean to + * video frame in the decoder. This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. * - * By default, this feature is off. + * By default, this feature is on. * * Supported in codecs: VP9 */ @@ -407,7 +432,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set noise sensitivity. * - * 0: off, 1: On(YOnly) + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * * Supported in codecs: VP9 */ @@ -422,6 +447,12 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -443,6 +474,7 @@ enum vp8e_enc_control_id { * \note Valid parameter range: * VP9E_CONTENT_DEFAULT = Regular video content (Default) * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention * * Supported in codecs: VP9 */ @@ -479,25 +511,13 @@ enum vp8e_enc_control_id { */ VP9E_SET_COLOR_SPACE, - /*!\brief Codec control function to set temporal layering mode. - * \note Valid ranges: 0..3, default is "0" - * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING). - * 0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING - * 1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS - * 2 = VP9E_TEMPORAL_LAYERING_MODE_0101 - * 3 = VP9E_TEMPORAL_LAYERING_MODE_0212 - * - * Supported in codecs: VP9 - */ - VP9E_SET_TEMPORAL_LAYERING_MODE, - /*!\brief Codec control function to set minimum interval between GF/ARF frames * * By default the value is set as 4. * * Supported in codecs: VP9 */ - VP9E_SET_MIN_GF_INTERVAL, + VP9E_SET_MIN_GF_INTERVAL = 48, /*!\brief Codec control function to set minimum interval between GF/ARF frames * @@ -527,7 +547,7 @@ enum vp8e_enc_control_id { * struct #vpx_svc_ref_frame_config defined below. * * Supported in codecs: VP9 - */ + */ VP9E_SET_SVC_REF_FRAME_CONFIG, /*!\brief Codec control function to set intended rendering image size. @@ -547,6 +567,14 @@ enum vp8e_enc_control_id { */ VP9E_SET_TARGET_LEVEL, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROW_MT, + /*!\brief Codec control function to get bitstream level. * * Supported in codecs: VP9 @@ -564,19 +592,190 @@ enum vp8e_enc_control_id { VP9E_SET_ALT_REF_AQ, /*!\brief Boost percentage for Golden Frame in CBR mode. - * - * This value controls the amount of boost given to Golden Frame in - * CBR mode. It is expressed as a percentage of the average - * per-frame bitrate, with the special (and default) value 0 meaning - * the feature is off, i.e., no golden frame boost in CBR mode and - * average bitrate target is used. - * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame - * than average frame, set this to 100. - * - * Supported in codecs: VP8 - */ + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e., 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ VP8E_SET_GF_CBR_BOOST_PCT, + + /*!\brief Codec control function to enable the extreme motion vector unit test + * in VP9. Please note that this is only used in motion vector unit test. + * + * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV + * + * Supported in codecs: VP9 + */ + VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, + + /*!\brief Codec control function to constrain the inter-layer prediction + * (prediction of lower spatial resolution) in VP9 SVC. + * + * 0 : inter-layer prediction on, 1 : off, 2 : off only on non-key frames + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_INTER_LAYER_PRED, + + /*!\brief Codec control function to set mode and thresholds for frame + * dropping in SVC. Drop frame thresholds are set per-layer. Mode is set as: + * 0 : layer-dependent dropping, 1 : constrained dropping, current layer drop + * forces drop on all upper layers. Default mode is 0. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, + + /*!\brief Codec control function to enable/disable use of golden reference as + * a second temporal reference for SVC. Only used when inter-layer prediction + * is disabled on INTER frames. + * + * 0: Off, 1: Enabled (default) + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_GF_TEMPORAL_REF, + + /*!\brief Codec control function to enable spatial layer sync frame, for any + * spatial layer. Enabling it for layer k means spatial layer k will disable + * all temporal prediction, but keep the inter-layer prediction. It will + * refresh any temporal reference buffer for that layer, and reset the + * temporal layer for the superframe to 0. Setting the layer sync for base + * spatial layer forces a key frame. Default is off (0) for all spatial + * layers. Spatial layer sync flag is reset to 0 after each encoded layer, + * so when control is invoked it is only used for the current superframe. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + + /*!\brief Codec control function to enable temporal dependency model. + * + * Vp9 allows the encoder to run temporal dependency model and use it to + * improve the compression performance. To enable, set this parameter to be + * 1. The default value is set to be 1. + */ + VP9E_SET_TPL, + + /*!\brief Codec control function to enable post encode frame drop. + * + * This will allow encoder to drop frame after it's encoded. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_POSTENCODE_DROP, + + /*!\brief Codec control function to set delta q for uv. + * + * Cap it at +/-15. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DELTA_Q_UV, + + /*!\brief Codec control function to disable increase Q on overshoot in CBR. + * + * 0: On (default), 1: Disable. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, + + /*!\brief Codec control function to disable loopfilter. + * + * 0: Loopfilter on all frames, 1: Disable on non reference frames. + * 2: Disable on all frames. + * + * Supported in codecs: VP9 + */ + VP9E_SET_DISABLE_LOOPFILTER, + + /*!\brief Codec control function to enable external rate control library. + * + * args[0]: path of the rate control library + * + * args[1]: private config of the rate control library + * + * Supported in codecs: VP9 + */ + VP9E_SET_EXTERNAL_RATE_CONTROL, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will do 3 things, only for 1 pass: + * - Turn off low motion computation + * - Turn off gf update constraint on key frame frequency + * - Turn off content mode for cyclic refresh + * + * With those, the rate control is expected to work exactly the same as the + * interface provided in ratectrl_rtc.cc/h + * + * Supported in codecs: VP9 + */ + VP9E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control function to get loopfilter level in the encoder. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LOOPFILTER_LEVEL, + + /*!\brief Codec control to get last quantizers for all spatial layers. + * + * Return value uses an array of internal quantizers scale defined by the + * codec, for all spatial layers. + * The size of the array passed in should be #VPX_SS_MAX_LAYERS. + * + * Supported in codecs: VP9 + */ + VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, + + /*!\brief Codec control to disable internal features in rate control. + * + * This will turn off cyclic refresh for vp8. + * + * With this, the rate control is expected to work exactly the same as the + * interface provided in vp8_ratectrl_rtc.cc/h + * + * Supported in codecs: VP8 + */ + VP8E_SET_RTC_EXTERNAL_RATECTRL, + + /*!\brief Codec control to set quantizer for the next frame. + * + * This will turn off cyclic refresh. Only applicable to 1-pass without + * spatial layers. + * + * Supported in codecs: VP9 + * + */ + VP9E_SET_QUANTIZER_ONE_PASS, + + /*!\brief Codec control function to enable key frame temporal filtering. + * + * Vp9 allows the encoder to run key frame temporal filtering and use it to + * improve the compression performance. To enable, set this parameter to be + * 1. The default value is set to be 0. + */ + VP9E_SET_KEY_FRAME_FILTERING, }; /*!\brief vpx 1-D scaling mode @@ -624,16 +823,20 @@ typedef enum vp9e_temporal_layering_mode { */ typedef struct vpx_roi_map { - /*! An id between 0 and 3 for each 16x16 region within a frame. */ + /*! If ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9) + * region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ - // TODO(paulwilkins): broken for VP9 which has 8 segments - // q and loop filter deltas for each segment - // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ - /*! Static breakout threshold for each segment. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. Valid range: [-63, 63].*/ + int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/ + /*! skip and ref frame segment is only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -674,10 +877,11 @@ typedef enum { VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; -/*!brief VP9 encoder content type */ +/*!\brief VP9 encoder content type */ typedef enum { VP9E_CONTENT_DEFAULT, VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, VP9E_CONTENT_INVALID } vp9e_tune_content; @@ -696,11 +900,13 @@ typedef enum { VP8_TUNE_PSNR, VP8_TUNE_SSIM } vp8e_tuning; * */ typedef struct vpx_svc_layer_id { - int spatial_layer_id; /**< Spatial layer id number. */ + int spatial_layer_id; /**< First spatial layer to start encoding. */ + // TODO(jianj): Deprecated, to be removed. int temporal_layer_id; /**< Temporal layer id number. */ + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; /**< Temp layer id. */ } vpx_svc_layer_id_t; -/*!\brief vp9 svc frame flag parameters. +/*!\brief vp9 svc frame flag parameters. * * This defines the frame flags and buffer indices for each spatial layer for * svc encoding. @@ -709,12 +915,58 @@ typedef struct vpx_svc_layer_id { * */ typedef struct vpx_svc_ref_frame_config { - int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */ - int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */ - int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */ - int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */ + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ + int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ + // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; +/*!\brief VP9 svc frame dropping mode. + * + * This defines the frame drop mode for SVC. + * + */ +typedef enum { + CONSTRAINED_LAYER_DROP, + /**< Upper layers are constrained to drop if current layer drops. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ + CONSTRAINED_FROM_ABOVE_DROP, + /**< Lower layers are constrained to drop if current layer drops. */ +} SVC_LAYER_DROP_MODE; + +/*!\brief vp9 svc frame dropping parameters. + * + * This defines the frame drop thresholds for each spatial layer, and + * the frame dropping mode: 0 = layer based frame dropping (default), + * 1 = constrained dropping where current layer drop forces all upper + * spatial layers to drop. + */ +typedef struct vpx_svc_frame_drop { + int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ + SVC_LAYER_DROP_MODE + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. */ +} vpx_svc_frame_drop_t; + +/*!\brief vp9 svc spatial layer sync parameters. + * + * This defines the spatial layer sync flag, defined per spatial layer. + * + */ +typedef struct vpx_svc_spatial_layer_sync { + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; /**< Sync layer flags */ + int base_layer_intra_only; /**< Flag for setting Intra-only frame on base */ +} vpx_svc_spatial_layer_sync_t; + /*!\cond */ /*!\brief VP8 encoder control function parameter type * @@ -723,26 +975,12 @@ typedef struct vpx_svc_ref_frame_config { * */ -VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) -#define VPX_CTRL_VP8E_SET_FRAME_FLAGS -VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) -#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) #define VPX_CTRL_VP8E_SET_SCALEMODE - -VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) -#define VPX_CTRL_VP9E_SET_SVC -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) -#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS -VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) -#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK -VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID - VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) #define VPX_CTRL_VP8E_SET_CPUUSED VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) @@ -755,7 +993,10 @@ VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD, unsigned int) #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS - +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) +#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) @@ -766,80 +1007,107 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ #define VPX_CTRL_VP8E_SET_TUNING VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) #define VPX_CTRL_VP8E_SET_CQ_LEVEL - +VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int) +#define VPX_CTRL_VP8E_SET_FRAME_FLAGS +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) +#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID +VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) +#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE +VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) +#define VPX_CTRL_VP9E_SET_LOSSLESS VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) #define VPX_CTRL_VP9E_SET_TILE_COLUMNS VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) #define VPX_CTRL_VP9E_SET_TILE_ROWS - -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER -VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) -#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64 -VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) -#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID - -VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT -VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT - -VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) -#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE - -VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int) -#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT - -VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int) -#define VPX_CTRL_VP9E_SET_LOSSLESS - VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING - VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) #define VPX_CTRL_VP9E_SET_AQ_MODE - -VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) -#define VPX_CTRL_VP9E_SET_ALT_REF_AQ - VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST - VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int) #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY - +VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) +#define VPX_CTRL_VP9E_SET_SVC +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *) +#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ #define VPX_CTRL_VP9E_SET_TUNE_CONTENT - +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) +#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID +VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *) +#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int) #define VPX_CTRL_VP9E_SET_COLOR_SPACE - VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL - VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int) #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL - VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP9E_GET_ACTIVEMAP - VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int) #define VPX_CTRL_VP9E_SET_COLOR_RANGE - VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG - VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) #define VPX_CTRL_VP9E_SET_RENDER_SIZE - VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL - +VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int) +#define VPX_CTRL_VP9E_SET_ROW_MT VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int) +#define VPX_CTRL_VP9E_SET_ALT_REF_AQ +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT +VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) +#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + vpx_svc_spatial_layer_sync_t *) +#define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL +VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) +#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP +VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int) +#define VPX_CTRL_VP9E_SET_DELTA_Q_UV +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int) +#define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR +VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int) +#define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER +VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *) +#define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL +VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *) +#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL +VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *) +#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS +VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int) +#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL +VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int) +#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS +VPX_CTRL_USE_TYPE(VP9E_SET_KEY_FRAME_FILTERING, int) +#define VPX_CTRL_VP9E_SET_KEY_FRAME_FILTERING /*!\endcond */ /*! @} - end defgroup vp8_encoder */ @@ -847,4 +1115,4 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) } // extern "C" #endif -#endif // VPX_VP8CX_H_ +#endif // VPX_VPX_VP8CX_H_ diff --git a/media/libvpx/libvpx/vpx/vp8dx.h b/media/libvpx/libvpx/vpx/vp8dx.h index 0d7759eb25..8c13649f4a 100644 --- a/media/libvpx/libvpx/vpx/vp8dx.h +++ b/media/libvpx/libvpx/vpx/vp8dx.h @@ -17,8 +17,8 @@ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder * interface. */ -#ifndef VPX_VP8DX_H_ -#define VPX_VP8DX_H_ +#ifndef VPX_VPX_VP8DX_H_ +#define VPX_VPX_VP8DX_H_ #ifdef __cplusplus extern "C" { @@ -32,7 +32,15 @@ extern "C" { * This interface provides the capability to decode VP8 streams. * @{ */ + +/*!\brief A single instance of the VP8 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp8_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp8_dx_algo; + +/*!\brief The interface to the VP8 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); /*!@} - end algorithm interface member group*/ @@ -41,7 +49,15 @@ extern vpx_codec_iface_t *vpx_codec_vp8_dx(void); * This interface provides the capability to decode VP9 streams. * @{ */ + +/*!\brief A single instance of the VP9 decoder. + *\deprecated This access mechanism is provided for backwards compatibility; + * prefer vpx_codec_vp9_dx(). + */ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; + +/*!\brief The interface to the VP9 decoder. + */ extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ @@ -116,6 +132,32 @@ enum vp8_dec_control_id { */ VP9_DECODE_SVC_SPATIAL_LAYER, + /*!\brief Codec control function to get last decoded frame quantizer. + * + * Return value uses internal quantizer scale defined by the codec. + * + * Supported in codecs: VP8, VP9 + */ + VPXD_GET_LAST_QUANTIZER, + + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9D_SET_ROW_MT, + + /*!\brief Codec control function to set loopfilter optimization. + * + * 0 : off, Loop filter is done after all tiles have been decoded + * 1 : on, Loop filter is done immediately after decode without + * waiting for all threads to sync. + * + * Supported in codecs: VP9 + */ + VP9D_SET_LOOP_FILTER_OPT, + VP8_DECODER_CTRL_ID_MAX }; @@ -137,10 +179,6 @@ typedef struct vpx_decrypt_init { void *decrypt_state; } vpx_decrypt_init; -/*!\brief A deprecated alias for vpx_decrypt_init. - */ -typedef vpx_decrypt_init vp8_decrypt_init; - /*!\cond */ /*!\brief VP8 decoder control function parameter type * @@ -159,16 +197,26 @@ VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VPXD_SET_DECRYPTOR VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) #define VPX_CTRL_VP8D_SET_DECRYPTOR +VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) +#define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) #define VPX_CTRL_VP9D_GET_BIT_DEPTH -VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) -#define VPX_CTRL_VP9D_GET_FRAME_SIZE +VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int) +#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER -#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *) +#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER +VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) +#define VPX_CTRL_VP9_DECODE_SET_ROW_MT +VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT /*!\endcond */ /*! @} - end defgroup vp8_decoder */ @@ -177,4 +225,4 @@ VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) } // extern "C" #endif -#endif // VPX_VP8DX_H_ +#endif // VPX_VPX_VP8DX_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_codec.h b/media/libvpx/libvpx/vpx/vpx_codec.h index fe75d23872..602889773d 100644 --- a/media/libvpx/libvpx/vpx/vpx_codec.h +++ b/media/libvpx/libvpx/vpx/vpx_codec.h @@ -22,58 +22,62 @@ * video codec algorithm. * * An application instantiates a specific codec instance by using - * vpx_codec_init() and a pointer to the algorithm's interface structure: + * vpx_codec_dec_init() or vpx_codec_enc_init() and a pointer to the + * algorithm's interface structure: *
  *     my_app.c:
  *       extern vpx_codec_iface_t my_codec;
  *       {
  *           vpx_codec_ctx_t algo;
- *           res = vpx_codec_init(&algo, &my_codec);
+ *           int threads = 4;
+ *           vpx_codec_dec_cfg_t cfg = { threads, 0, 0 };
+ *           res = vpx_codec_dec_init(&algo, &my_codec, &cfg, 0);
  *       }
  *     
* * Once initialized, the instance is manged using other functions from * the vpx_codec_* family. */ -#ifndef VPX_VPX_CODEC_H_ -#define VPX_VPX_CODEC_H_ +#ifndef VPX_VPX_VPX_CODEC_H_ +#define VPX_VPX_VPX_CODEC_H_ #ifdef __cplusplus extern "C" { #endif -#include "./vpx_integer.h" #include "./vpx_image.h" +#include "./vpx_integer.h" /*!\brief Decorator indicating a function is deprecated */ -#ifndef DEPRECATED -#if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__((deprecated)) +#ifndef VPX_DEPRECATED +#if defined(__GNUC__) +#define VPX_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) -#define DEPRECATED +#define VPX_DEPRECATED #else -#define DEPRECATED +#define VPX_DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* VPX_DEPRECATED */ -#ifndef DECLSPEC_DEPRECATED -#if defined(__GNUC__) && __GNUC__ -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#ifndef VPX_DECLSPEC_DEPRECATED +#if defined(__GNUC__) +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #elif defined(_MSC_VER) -/*!\brief \copydoc #DEPRECATED */ -#define DECLSPEC_DEPRECATED __declspec(deprecated) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) #else -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* VPX_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ -#ifdef UNUSED -#elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__((unused)) +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) #else -#define UNUSED +#define VPX_UNUSED #endif +#endif /* VPX_UNUSED */ /*!\brief Current ABI version number * @@ -83,7 +87,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_CODEC_ABI_VERSION (4 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/ /*!\brief Algorithm return codes */ typedef enum { @@ -152,6 +156,10 @@ typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ +/*! Can support images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x4 + /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow for @@ -236,11 +244,11 @@ typedef enum vpx_bit_depth { */ int vpx_codec_version(void); #define VPX_VERSION_MAJOR(v) \ - ((v >> 16) & 0xff) /**< extract major from packed version */ + (((v) >> 16) & 0xff) /**< extract major from packed version */ #define VPX_VERSION_MINOR(v) \ - ((v >> 8) & 0xff) /**< extract minor from packed version */ + (((v) >> 8) & 0xff) /**< extract minor from packed version */ #define VPX_VERSION_PATCH(v) \ - ((v >> 0) & 0xff) /**< extract patch from packed version */ + (((v) >> 0) & 0xff) /**< extract patch from packed version */ /*!\brief Return the version major number */ #define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) @@ -310,19 +318,21 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err); * \param[in] ctx Pointer to this instance's context. * */ -const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about - * the last error. + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * * \retval NULL * No detailed information is available. */ -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * @@ -337,9 +347,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); * \param[in] ctx Pointer to this instance's context * * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. */ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); @@ -409,7 +421,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_USE_TYPE(id, typ) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id, typ data) { \ @@ -426,13 +438,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); * It defines a static function with the correctly typed arguments as a * wrapper to the type-unsafe internal function. */ -#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ - return vpx_codec_control_(ctx, ctrl_id, data); \ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ /*!\brief vpx_codec_control void type definition macro @@ -447,7 +459,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_VOID(id) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id) { \ @@ -460,4 +472,4 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); #ifdef __cplusplus } #endif -#endif // VPX_VPX_CODEC_H_ +#endif // VPX_VPX_VPX_CODEC_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_codec.mk b/media/libvpx/libvpx/vpx/vpx_codec.mk index b77f45817b..778f1a6146 100644 --- a/media/libvpx/libvpx/vpx/vpx_codec.mk +++ b/media/libvpx/libvpx/vpx/vpx_codec.mk @@ -15,10 +15,6 @@ API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h -ifeq ($(CONFIG_VP9_ENCODER),yes) - API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c - API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h -endif API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h @@ -28,14 +24,17 @@ API_DOC_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h API_DOC_SRCS-yes += vpx_codec.h API_DOC_SRCS-yes += vpx_decoder.h API_DOC_SRCS-yes += vpx_encoder.h +API_DOC_SRCS-$(CONFIG_ENCODERS) += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-$(CONFIG_ENCODERS) += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c API_SRCS-yes += vpx_decoder.h API_SRCS-yes += src/vpx_encoder.c API_SRCS-yes += vpx_encoder.h API_SRCS-yes += internal/vpx_codec_internal.h +API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c API_SRCS-yes += vpx_codec.h @@ -43,3 +42,5 @@ API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h +API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/media/libvpx/libvpx/vpx/vpx_decoder.h b/media/libvpx/libvpx/vpx/vpx_decoder.h index 2ff12112bc..0536d5d5ad 100644 --- a/media/libvpx/libvpx/vpx/vpx_decoder.h +++ b/media/libvpx/libvpx/vpx/vpx_decoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_DECODER_H_ -#define VPX_VPX_DECODER_H_ +#ifndef VPX_VPX_VPX_DECODER_H_ +#define VPX_VPX_VPX_DECODER_H_ /*!\defgroup decoder Decoder Algorithm Interface * \ingroup codec @@ -29,7 +29,7 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export #include "./vpx_frame_buffer.h" /*!\brief Current ABI version number @@ -58,6 +58,10 @@ extern "C" { #define VPX_CODEC_CAP_ERROR_CONCEALMENT 0x80000 /*!\brief Can receive encoded frames one fragment at a time */ #define VPX_CODEC_CAP_INPUT_FRAGMENTS 0x100000 +/*!\brief Can support frame-based multi-threading */ +#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 +/*!\brief Can support external frame buffers */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /*! \brief Initialization-time Feature Enabling * @@ -66,11 +70,6 @@ extern "C" { * * The available flags are specified by VPX_CODEC_USE_* defines. */ -/*!\brief Can support frame-based multi-threading */ -#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 -/*!brief Can support external frame buffers */ -#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 - #define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ /*!\brief Conceal errors in decoded frames */ #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 @@ -128,7 +127,7 @@ typedef struct vpx_codec_dec_cfg { * \param[in] ver ABI version number. Must be set to * VPX_DECODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The decoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ @@ -153,7 +152,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, * \param[in] iface Pointer to the algorithm interface * \param[in] data Pointer to a block of data to parse * \param[in] data_sz Size of the data buffer - * \param[in,out] si Pointer to stream info to update. The size member + * \param[in,out] si Pointer to stream info to update. The sz member * \ref MUST be properly initialized, but \ref MAY be * clobbered by the algorithm. This parameter \ref MAY * be NULL. @@ -171,7 +170,7 @@ vpx_codec_err_t vpx_codec_peek_stream_info(vpx_codec_iface_t *iface, * Returns information about the stream that has been parsed during decoding. * * \param[in] ctx Pointer to this instance's context - * \param[in,out] si Pointer to stream info to update. The size member + * \param[in,out] si Pointer to stream info to update. The sz member * \ref MUST be properly initialized, but \ref MAY be * clobbered by the algorithm. This parameter \ref MAY * be NULL. @@ -185,8 +184,8 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, /*!\brief Decode data * * Processes a buffer of coded data. If the processing results in a new - * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be - * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode + * decoded frame becoming available, put_slice and put_frame callbacks may be + * invoked, as appropriate. Encoded data \ref MUST be passed in DTS (decode * time stamp) order. Frames produced will always be in PTS (presentation * time stamp) order. * If the decoder is configured with VPX_CODEC_USE_INPUT_FRAGMENTS enabled, @@ -199,13 +198,15 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, * * \param[in] ctx Pointer to this instance's context * \param[in] data Pointer to this block of new coded data. If - * NULL, a VPX_CODEC_CB_PUT_FRAME event is posted - * for the previously decoded frame. + * NULL, the put_frame callback is invoked for + * the previously decoded frame. * \param[in] data_sz Size of the coded data, in bytes. * \param[in] user_priv Application specific data to associate with * this frame. * \param[in] deadline Soft deadline the decoder should attempt to meet, * in us. Set to zero for unlimited. + * NOTE: The deadline parameter is ignored. Always + * pass 0. * * \return Returns #VPX_CODEC_OK if the coded data was processed completely * and future pictures can be decoded without error. Otherwise, @@ -236,11 +237,10 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, vpx_codec_iter_t *iter); /*!\defgroup cap_put_frame Frame-Based Decoding Functions * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling these - * functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR + * The following function is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_FRAME capability. Calling this + * function for codecs that don't advertise this capability will result in + * an error code being returned, usually VPX_CODEC_INCAPABLE. * @{ */ @@ -264,8 +264,9 @@ typedef void (*vpx_codec_put_frame_cb_fn_t)(void *user_priv, * \retval #VPX_CODEC_OK * Callback successfully registered. * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of posting frame completion. */ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, vpx_codec_put_frame_cb_fn_t cb, @@ -275,18 +276,17 @@ vpx_codec_err_t vpx_codec_register_put_frame_cb(vpx_codec_ctx_t *ctx, /*!\defgroup cap_put_slice Slice-Based Decoding Functions * - * The following functions are required to be implemented for all decoders - * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling these - * functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_ERROR + * The following function is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_PUT_SLICE capability. Calling this + * function for codecs that don't advertise this capability will result in + * an error code being returned, usually VPX_CODEC_INCAPABLE. * @{ */ /*!\brief put slice callback prototype * * This callback is invoked by the decoder to notify the application of - * the availability of partially decoded image data. The + * the availability of partially decoded image data. */ typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, const vpx_image_t *img, @@ -305,8 +305,9 @@ typedef void (*vpx_codec_put_slice_cb_fn_t)(void *user_priv, * \retval #VPX_CODEC_OK * Callback successfully registered. * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * posting slice completion. + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of posting slice completion. */ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, vpx_codec_put_slice_cb_fn_t cb, @@ -316,10 +317,10 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions * - * The following section is required to be implemented for all decoders + * The following function is required to be implemented for all decoders * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. * Calling this function for codecs that don't advertise this capability - * will result in an error code being returned, usually VPX_CODEC_ERROR. + * will result in an error code being returned, usually VPX_CODEC_INCAPABLE. * * \note * Currently this only works with VP9. @@ -344,8 +345,9 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx, * \retval #VPX_CODEC_INVALID_PARAM * One or more of the callbacks were NULL. * \retval #VPX_CODEC_ERROR - * Decoder context not initialized, or algorithm not capable of - * using external frame buffers. + * Decoder context not initialized. + * \retval #VPX_CODEC_INCAPABLE + * Algorithm not capable of using external frame buffers. * * \note * When decoding VP9, the application may be required to pass in at least @@ -362,4 +364,4 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions( #ifdef __cplusplus } #endif -#endif // VPX_VPX_DECODER_H_ +#endif // VPX_VPX_VPX_DECODER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_encoder.h b/media/libvpx/libvpx/vpx/vpx_encoder.h index 28fcd5f999..04e6b028f3 100644 --- a/media/libvpx/libvpx/vpx/vpx_encoder.h +++ b/media/libvpx/libvpx/vpx/vpx_encoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_ENCODER_H_ -#define VPX_VPX_ENCODER_H_ +#ifndef VPX_VPX_VPX_ENCODER_H_ +#define VPX_VPX_VPX_ENCODER_H_ /*!\defgroup encoder Encoder Algorithm Interface * \ingroup codec @@ -29,7 +29,8 @@ extern "C" { #endif -#include "./vpx_codec.h" +#include "./vpx_codec.h" // IWYU pragma: export +#include "./vpx_ext_ratectrl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -39,15 +40,9 @@ extern "C" { /*! Temporal Scalability: Maximum number of coding layers */ #define VPX_TS_MAX_LAYERS 5 -/*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */ -#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY - /*! Temporal+Spatial Scalability: Maximum number of coding layers */ #define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. -/*!\deprecated Use #VPX_MAX_LAYERS instead. */ -#define MAX_LAYERS VPX_MAX_LAYERS // 3 temporal + 4 spatial layers allowed. - /*! Spatial Scalability: Maximum number of coding layers */ #define VPX_SS_MAX_LAYERS 5 @@ -61,9 +56,15 @@ extern "C" { * must be bumped. Examples include, but are not limited to, changing * types, removing or reassigning enums, adding/removing/rearranging * fields to structures + * + * \note + * VPX_ENCODER_ABI_VERSION has a VPX_EXT_RATECTRL_ABI_VERSION component + * because the VP9E_SET_EXTERNAL_RATE_CONTROL codec control uses + * vpx_rc_funcs_t. */ #define VPX_ENCODER_ABI_VERSION \ - (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (18 + VPX_CODEC_ABI_VERSION + \ + VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -83,10 +84,6 @@ extern "C" { */ #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 -/*! Can support input images at greater than 8 bitdepth. - */ -#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 - /*! \brief Initialization-time Feature Enabling * * Certain codec features must be known at initialization time, to allow @@ -123,14 +120,14 @@ typedef int64_t vpx_codec_pts_t; * support frame types that are codec specific (MPEG-1 D-frames for example) */ typedef uint32_t vpx_codec_frame_flags_t; -#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */ /*!\brief frame can be dropped without affecting the stream (no future frame * depends on this one) */ -#define VPX_FRAME_IS_DROPPABLE 0x2 +#define VPX_FRAME_IS_DROPPABLE 0x2u /*!\brief frame should be decoded but will not be shown */ -#define VPX_FRAME_IS_INVISIBLE 0x4 +#define VPX_FRAME_IS_INVISIBLE 0x4u /*!\brief this is a fragment of the encoded frame */ -#define VPX_FRAME_IS_FRAGMENT 0x8 +#define VPX_FRAME_IS_FRAGMENT 0x8u /*!\brief Error Resilient flags * @@ -140,12 +137,13 @@ typedef uint32_t vpx_codec_frame_flags_t; */ typedef uint32_t vpx_codec_er_flags_t; /*!\brief Improve resiliency against losses of whole frames */ -#define VPX_ERROR_RESILIENT_DEFAULT 0x1 +#define VPX_ERROR_RESILIENT_DEFAULT 0x1u /*!\brief The frame partitions are independently decodable by the bool decoder, * meaning that partitions can be decoded even though earlier partitions have * been lost. Note that intra prediction is still done over the partition - * boundary. */ -#define VPX_ERROR_RESILIENT_PARTITIONS 0x2 + * boundary. + * \note This is only supported by VP8.*/ +#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u /*!\brief Encoder output packet variants * @@ -154,16 +152,10 @@ typedef uint32_t vpx_codec_er_flags_t; * extend this list to provide additional functionality. */ enum vpx_codec_cx_pkt_kind { - VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ - VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ - VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ - VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed before the next ABI -// bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) - VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ - VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ -#endif + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ }; @@ -187,6 +179,13 @@ typedef struct vpx_codec_cx_pkt { * Only applicable when "output partition" mode is enabled. First * partition has id 0.*/ int partition_id; + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first one.*/ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ + /*!\brief Flag to indicate if spatial layer frame in this packet is + * encoded or dropped. VP8 will always be set to 1.*/ + uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS]; } frame; /**< data for compressed frame packet */ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ @@ -194,14 +193,9 @@ typedef struct vpx_codec_cx_pkt { unsigned int samples[4]; /**< Number of samples, total/y/u/v */ uint64_t sse[4]; /**< sum squared error, total/y/u/v */ double psnr[4]; /**< PSNR, total/y/u/v */ + int spatial_layer_id; /**< Spatial layer id */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed before the next -// ABI bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) - size_t layer_sizes[VPX_SS_MAX_LAYERS]; - struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; -#endif /* This packet size is fixed to allow codecs to extend this * interface without having to manage storage for raw packets, @@ -217,8 +211,6 @@ typedef struct vpx_codec_cx_pkt { * This callback function, when registered, returns with packets when each * spatial layer is encoded. */ -// putting the definitions here for now. (agrange: find if there -// is a better place for this) typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt, void *user_data); @@ -238,11 +230,11 @@ typedef struct vpx_rational { } vpx_rational_t; /**< alias for struct vpx_rational */ /*!\brief Multi-pass Encoding Pass */ -enum vpx_enc_pass { +typedef enum vpx_enc_pass { VPX_RC_ONE_PASS, /**< Single pass mode */ VPX_RC_FIRST_PASS, /**< First pass of multi-pass mode */ VPX_RC_LAST_PASS /**< Final pass of multi-pass mode */ -}; +} vpx_enc_pass; /*!\brief Rate control mode */ enum vpx_rc_mode { @@ -275,6 +267,8 @@ enum vpx_kf_mode { */ typedef long vpx_enc_frame_flags_t; #define VPX_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ +/** Calculate PSNR on this frame, requires g_lag_in_frames to be 0 */ +#define VPX_EFLAG_CALCULATE_PSNR (1 << 1) /*!\brief Encoder configuration structure * @@ -287,12 +281,9 @@ typedef struct vpx_codec_enc_cfg { * generic settings (g) */ - /*!\brief Algorithm specific "usage" value + /*!\brief Deprecated: Algorithm specific "usage" value * - * Algorithms may define multiple values for usage, which may convey the - * intent of how the application intends to use the stream. If this value - * is non-zero, consult the documentation for the codec to determine its - * meaning. + * This value must be zero. */ unsigned int g_usage; @@ -403,9 +394,6 @@ typedef struct vpx_codec_enc_cfg { * trade-off is often acceptable, but for many applications is not. It can * be disabled in these cases. * - * Note that not all codecs support this feature. All vpx VPx codecs do. - * For other codecs, consult the documentation for that algorithm. - * * This threshold is described as a percentage of the target data buffer. * When the data buffer falls below this percentage of fullness, a * dropped frame is indicated. Set the threshold to zero (0) to disable @@ -478,7 +466,9 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Target data rate * - * Target bandwidth to use for this stream, in kilobits per second. + * Target bitrate to use for this stream, in kilobits per second. + * Internally capped to the smaller of the uncompressed bitrate and + * 1000000 kilobits per second. */ unsigned int rc_target_bitrate; @@ -491,8 +481,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_min_quantizer; @@ -501,8 +490,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_max_quantizer; @@ -512,25 +500,31 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Rate control adaptation undershoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be subtracted from the target bitrate in order to compensate * for prior overshoot. - * - * Valid values in the range 0-1000. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-100 VP9: 0-100. */ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be added to the target bitrate in order to compensate for * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. * - * Valid values in the range 0-1000. + * Valid values in the range VP8:0-100 VP9: 0-100. */ unsigned int rc_overshoot_pct; @@ -595,6 +589,13 @@ typedef struct vpx_codec_enc_cfg { */ unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ + unsigned int rc_2pass_vbr_corpus_complexity; + /* * keyframing settings (kf) */ @@ -645,7 +646,7 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Target bitrate for each spatial layer. * * These values specify the target coding bitrate to be used for each - * spatial layer. + * spatial layer. (in kbps) */ unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS]; @@ -658,7 +659,7 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Target bitrate for each temporal layer. * * These values specify the target coding bitrate to be used for each - * temporal layer. + * temporal layer. (in kbps) */ unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS]; @@ -675,7 +676,7 @@ typedef struct vpx_codec_enc_cfg { * membership of frames to temporal layers. For example, if the * ts_periodicity = 8, then the frames are assigned to coding layers with a * repeated sequence of length 8. - */ + */ unsigned int ts_periodicity; /*!\brief Template defining the membership of frames to temporal layers. @@ -684,13 +685,13 @@ typedef struct vpx_codec_enc_cfg { * For a 2-layer encoding that assigns even numbered frames to one temporal * layer (0) and odd numbered frames to a second temporal layer (1) with * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). - */ + */ unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; /*!\brief Target bitrate for each spatial/temporal layer. * * These values specify the target coding bitrate to be used for each - * spatial/temporal layer. + * spatial/temporal layer. (in kbps) * */ unsigned int layer_target_bitrate[VPX_MAX_LAYERS]; @@ -703,6 +704,151 @@ typedef struct vpx_codec_enc_cfg { * */ int temporal_layering_mode; + + /*!\brief A flag indicating whether to use external rate control parameters. + * By default is 0. If set to 1, the following parameters will be used in the + * rate control system. + */ + int use_vizier_rc_params; + + /*!\brief Active worst quality factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t active_wq_factor; + + /*!\brief Error per macroblock adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t err_per_mb_factor; + + /*!\brief Second reference default decay limit. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t sr_default_decay_limit; + + /*!\brief Second reference difference factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t sr_diff_factor; + + /*!\brief Keyframe error per macroblock adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_err_per_mb_factor; + + /*!\brief Keyframe minimum boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_min_boost_factor; + + /*!\brief Keyframe maximum boost adjustment factor, for the first keyframe + * in a chunk. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_max_boost_first_factor; + + /*!\brief Keyframe maximum boost adjustment factor, for subsequent keyframes. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_frame_max_boost_subs_factor; + + /*!\brief Keyframe maximum total boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t kf_max_total_boost_factor; + + /*!\brief Golden frame maximum total boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t gf_max_total_boost_factor; + + /*!\brief Golden frame maximum boost adjustment factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t gf_frame_max_boost_factor; + + /*!\brief Zero motion power factor. + * + * Rate control parameters, set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t zm_factor; + + /*!\brief Rate-distortion multiplier for inter frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_inter_qp_fac; + + /*!\brief Rate-distortion multiplier for alt-ref frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_arf_qp_fac; + + /*!\brief Rate-distortion multiplier for key frames. + * The multiplier is a crucial parameter in the calculation of rate distortion + * cost. It is often related to the qp (qindex) value. + * Rate control parameters, could be set from external experiment results. + * Only when |use_vizier_rc_params| is set to 1, the pass in value will be + * used. Otherwise, the default value is used. + * + */ + vpx_rational_t rd_mult_key_qp_fac; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ /*!\brief vp9 svc extra configure parameters @@ -717,11 +863,12 @@ typedef struct vpx_svc_parameters { int scaling_factor_den[VPX_MAX_LAYERS]; /**< Scaling factor-denominator */ int speed_per_layer[VPX_MAX_LAYERS]; /**< Speed setting for each sl */ int temporal_layering_mode; /**< Temporal layering mode */ + int loopfilter_ctrl[VPX_MAX_LAYERS]; /**< Loopfilter ctrl for each sl */ } vpx_svc_extra_cfg_t; /*!\brief Initialize an encoder instance * - * Initializes a encoder context using the given interface. Applications + * Initializes an encoder context using the given interface. Applications * should call the vpx_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. @@ -730,9 +877,12 @@ typedef struct vpx_svc_parameters { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] cfg Configuration to use. * \param[in] flags Bitfield of VPX_CODEC_USE_* flags * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION @@ -755,27 +905,32 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, /*!\brief Initialize multi-encoder instance * - * Initializes multi-encoder context using the given interface. + * Initializes multiple encoder contexts using the given interface. * Applications should call the vpx_codec_enc_init_multi convenience macro * instead of this function directly, to ensure that the ABI version number * parameter is properly initialized. * - * \param[in] ctx Pointer to this instance's context. + * \param[in] ctx Pointer to an array of num_enc instances' contexts. * \param[in] iface Pointer to the algorithm interface to use. - * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] cfg An array of num_enc configurations to use. * \param[in] num_enc Total number of encoders. * \param[in] flags Bitfield of VPX_CODEC_USE_* flags - * \param[in] dsf Pointer to down-sampling factors. + * \param[in] dsf Pointer to an array of num_enc down-sampling factors. * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The encoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. + * + * \note + * This is only supported by VP8. iface must point to the interface to the VP8 + * encoder. */ vpx_codec_err_t vpx_codec_enc_init_multi_ver( - vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, - int num_enc, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver); + vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, + const vpx_codec_enc_cfg_t *cfg, int num_enc, vpx_codec_flags_t flags, + const vpx_rational_t *dsf, int ver); /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver() * @@ -795,7 +950,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( * * \param[in] iface Pointer to the algorithm interface to use. * \param[out] cfg Configuration buffer to populate. - * \param[in] reserved Must set to 0 for VP8 and VP9. + * \param[in] usage Must be set to 0. * * \retval #VPX_CODEC_OK * The configuration was populated. @@ -806,7 +961,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( */ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, - unsigned int reserved); + unsigned int usage); /*!\brief Set or change configuration * @@ -829,21 +984,36 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, * * Retrieves a stream level global header packet, if supported by the codec. * + * \li VP8: Unsupported + * \li VP9: Returns a buffer of ID (1 byte)|Length (1 byte)|Length + * bytes values. The function should be called after encoding to retrieve + * the most accurate information. + * * \param[in] ctx Pointer to this instance's context * * \retval NULL * Encoder does not support global header * \retval Non-NULL - * Pointer to buffer containing global header packet + * Pointer to buffer containing global header packet. The buffer pointer + * and its contents are only valid for the lifetime of \a ctx. The contents + * may change in subsequent calls to the function. + * \sa + * https://www.webmproject.org/docs/container/#vp9-codec-feature-metadata-codecprivate */ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); +/*!\brief Encode Deadline + * + * This type indicates a deadline, in microseconds, to be passed to + * vpx_codec_encode(). + */ +typedef unsigned long vpx_enc_deadline_t; /*!\brief deadline parameter analogous to VPx REALTIME mode. */ -#define VPX_DL_REALTIME (1) +#define VPX_DL_REALTIME 1ul /*!\brief deadline parameter analogous to VPx GOOD QUALITY mode. */ -#define VPX_DL_GOOD_QUALITY (1000000) +#define VPX_DL_GOOD_QUALITY 1000000ul /*!\brief deadline parameter analogous to VPx BEST QUALITY mode. */ -#define VPX_DL_BEST_QUALITY (0) +#define VPX_DL_BEST_QUALITY 0ul /*!\brief Encode a frame * * Encodes a video frame at the given "presentation time." The presentation @@ -855,7 +1025,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); * implicit that limiting the available time to encode will degrade the * output quality. The encoder can be given an unlimited time to produce the * best possible frame by specifying a deadline of '0'. This deadline - * supercedes the VPx notion of "best quality, good quality, realtime". + * supersedes the VPx notion of "best quality, good quality, realtime". * Applications that wish to map these former settings to the new deadline * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, * and #VPX_DL_BEST_QUALITY. @@ -868,6 +1038,8 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); * * \param[in] ctx Pointer to this instance's context * \param[in] img Image data to encode, NULL to flush. + * Encoding sample values outside the range + * [0..(1<bit_depth)-1] is undefined behavior. * \param[in] pts Presentation time stamp, in timebase units. * \param[in] duration Duration to show frame, in timebase units. * \param[in] flags Flags to use for encoding this frame. @@ -883,7 +1055,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, vpx_enc_frame_flags_t flags, - unsigned long deadline); + vpx_enc_deadline_t deadline); /*!\brief Set compressed data output buffer * @@ -927,6 +1099,12 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, const vpx_image_t *img, * The buffer was set successfully. * \retval #VPX_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. + * + * \note + * `duration` and `deadline` are of the unsigned long type, which can be 32 + * or 64 bits. `duration` and `deadline` must be less than or equal to + * UINT32_MAX so that their ranges are independent of the size of unsigned + * long. */ vpx_codec_err_t vpx_codec_set_cx_data_buf(vpx_codec_ctx_t *ctx, const vpx_fixed_buf_t *buf, @@ -977,4 +1155,4 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); #ifdef __cplusplus } #endif -#endif // VPX_VPX_ENCODER_H_ +#endif // VPX_VPX_VPX_ENCODER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h new file mode 100644 index 0000000000..1c502f8101 --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_ext_ratectrl.h @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Defines structs and callbacks needed for external rate control. + * + */ +#ifndef VPX_VPX_VPX_EXT_RATECTRL_H_ +#define VPX_VPX_VPX_EXT_RATECTRL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" +#include "./vpx_tpl.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures. + */ +#define VPX_EXT_RATECTRL_ABI_VERSION (7 + VPX_TPL_ABI_VERSION) + +/*!\brief Corresponds to MAX_STATIC_GF_GROUP_LENGTH defined in vp9_ratectrl.h + */ +#define VPX_RC_MAX_STATIC_GF_GROUP_LENGTH 250 + +/*!\brief Max number of ref frames returned by the external RC. + * + * Corresponds to MAX_REF_FRAMES defined in vp9_blockd.h. + */ +#define VPX_RC_MAX_REF_FRAMES 4 + +/*!\brief The type of the external rate control. + * + * This controls what encoder parameters are determined by the external rate + * control. + */ +typedef enum vpx_rc_type { + /*! + * The external rate control doesn't determine anything. + * This mode is used as baseline. + */ + VPX_RC_NONE = 0, + /*! + * The external rate control model determines the quantization parameter (QP) + * for each frame. + */ + VPX_RC_QP = 1 << 0, + /*! + * The external rate control model determines the group of picture (GOP) of + * the video sequence. + */ + VPX_RC_GOP = 1 << 1, + /*! + * The external rate control model determines the rate-distortion multiplier + * (rdmult) for the current frame. + */ + VPX_RC_RDMULT = 1 << 2, + /*! + * The external rate control model determines both QP and GOP. + */ + VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP, + /*! + * The external rate control model determines the QP, GOP and the rdmult. + */ + VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT +} vpx_rc_type_t; + +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, + VPX_RC_CQ = 2, +} vpx_ext_rc_mode_t; + +/*!\brief Corresponds to FRAME_UPDATE_TYPE defined in vp9_firstpass.h. + */ +typedef enum vpx_rc_frame_update_type { + VPX_RC_INVALID_UPDATE_TYPE = -1, + VPX_RC_KF_UPDATE = 0, + VPX_RC_LF_UPDATE = 1, + VPX_RC_GF_UPDATE = 2, + VPX_RC_ARF_UPDATE = 3, + VPX_RC_OVERLAY_UPDATE = 4, + VPX_RC_MID_OVERLAY_UPDATE = 5, + VPX_RC_USE_BUF_FRAME = 6, +} vpx_rc_frame_update_type_t; + +/*!\brief Name for the ref frames returned by the external RC. + * + * Corresponds to the ref frames defined in vp9_blockd.h. + */ +typedef enum vpx_rc_ref_name { + VPX_RC_INVALID_REF_FRAME = -1, + VPX_RC_INTRA_FRAME = 0, + VPX_RC_LAST_FRAME = 1, + VPX_RC_GOLDEN_FRAME = 2, + VPX_RC_ALTREF_FRAME = 3, +} vpx_rc_ref_name_t; + +/*!\brief Abstract rate control model handler + * + * The encoder will receive the model handler from + * vpx_rc_funcs_t::create_model(). + */ +typedef void *vpx_rc_model_t; + +/*!\brief A reserved value for the q index. + * If the external rate control model returns this value, + * the encoder will use the default q selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_Q -1 + +/*!\brief A reserved value for the rdmult. + * If the external rate control model returns this value, + * the encoder will use the default rdmult selected by libvpx's rate control + * system. + */ +#define VPX_DEFAULT_RDMULT -1 + +/*!\brief Superblock quantization parameters + * Store the superblock quantization parameters + */ +typedef struct sb_parameters { + int q_index; /**< Quantizer step index [0..255]*/ + int rdmult; /**< Superblock level Lagrangian multiplier*/ +} sb_params; + +/*!\brief Encode frame decision made by the external rate control model + * + * The encoder will receive the decision from the external rate control model + * through vpx_rc_funcs_t::get_encodeframe_decision(). + */ +typedef struct vpx_rc_encodeframe_decision { + int q_index; /**< Required: Quantizer step index [0..255]*/ + int rdmult; /**< Required: Frame level Lagrangian multiplier*/ + int delta_q_uv; /**< Required: Delta QP for UV */ + /*! + * Optional: Superblock quantization parameters + * It is zero initialized by default. It will be set for key and ARF frames + * but not leaf frames. + */ + sb_params *sb_params_list; +} vpx_rc_encodeframe_decision_t; + +/*!\brief Information for the frame to be encoded. + * + * The encoder will send the information to external rate control model through + * vpx_rc_funcs_t::get_encodeframe_decision(). + * + */ +typedef struct vpx_rc_encodeframe_info { + /*! + * 0: Key frame + * 1: Inter frame + * 2: Alternate reference frame + * 3: Overlay frame + * 4: Golden frame + */ + int frame_type; + int show_index; /**< display index, starts from zero*/ + int coding_index; /**< coding index, starts from zero*/ + /*! + * index of the current frame in this group of picture, starts from zero. + */ + int gop_index; + int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/ + /*! + * The validity of the three reference frames. + * 0: Invalid + * 1: Valid + */ + int ref_frame_valid_list[3]; + /*! + * The length of the current GOP. + */ + int gop_size; + /*! + * Whether the current GOP uses an alt ref. + */ + int use_alt_ref; +} vpx_rc_encodeframe_info_t; + +/*!\brief Frame coding result + * + * The encoder will send the result to the external rate control model through + * vpx_rc_funcs_t::update_encodeframe_result(). + */ +typedef struct vpx_rc_encodeframe_result { + int64_t bit_count; /**< number of bits spent on coding the frame*/ + int actual_encoding_qindex; /**< the actual qindex used to encode the frame*/ +} vpx_rc_encodeframe_result_t; + +/*!\brief Status returned by rate control callback functions. + */ +typedef enum vpx_rc_status { + VPX_RC_OK = 0, + VPX_RC_ERROR = 1, +} vpx_rc_status_t; + +/*!\brief First pass frame stats + * This is a mirror of vp9's FIRSTPASS_STATS except that spatial_layer_id is + * omitted + */ +typedef struct vpx_rc_frame_stats { + /*! + * Frame number in display order, if stats are for a single frame. + * No real meaning for a collection of frames. + */ + double frame; + /*! + * Weight assigned to this frame (or total weight for the collection of + * frames) currently based on intra factor and brightness factor. This is used + * to distribute bits between easier and harder frames. + */ + double weight; + /*! + * Intra prediction error. + */ + double intra_error; + /*! + * Best of intra pred error and inter pred error using last frame as ref. + */ + double coded_error; + /*! + * Best of intra pred error and inter pred error using golden frame as ref. + */ + double sr_coded_error; + /*! + * Estimate the noise energy of the current frame. + */ + double frame_noise_energy; + /*! + * Percentage of blocks with inter pred error < intra pred error. + */ + double pcnt_inter; + /*! + * Percentage of blocks using (inter prediction and) non-zero motion vectors. + */ + double pcnt_motion; + /*! + * Percentage of blocks where golden frame was better than last or intra: + * inter pred error using golden frame < inter pred error using last frame and + * inter pred error using golden frame < intra pred error + */ + double pcnt_second_ref; + /*! + * Percentage of blocks where intra and inter prediction errors were very + * close. + */ + double pcnt_neutral; + /*! + * Percentage of blocks that have intra error < inter error and inter error < + * LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 + */ + double pcnt_intra_low; + /*! + * Percentage of blocks that have intra error < inter error and intra error < + * LOW_I_THRESH but inter error >= LOW_I_THRESH LOW_I_THRESH + * - bit_depth 8: LOW_I_THRESH = 24000 + * - bit_depth 10: LOW_I_THRESH = 24000 << 4 + * - bit_depth 12: LOW_I_THRESH = 24000 << 8 + */ + double pcnt_intra_high; + /*! + * Percentage of blocks that have almost no intra error residual + * (i.e. are in effect completely flat and untextured in the intra + * domain). In natural videos this is uncommon, but it is much more + * common in animations, graphics and screen content, so may be used + * as a signal to detect these types of content. + */ + double intra_skip_pct; + /*! + * Percentage of blocks that have intra error < SMOOTH_INTRA_THRESH + * - bit_depth 8: SMOOTH_INTRA_THRESH = 4000 + * - bit_depth 10: SMOOTH_INTRA_THRESH = 4000 << 4 + * - bit_depth 12: SMOOTH_INTRA_THRESH = 4000 << 8 + */ + double intra_smooth_pct; + /*! + * Image mask rows top and bottom. + */ + double inactive_zone_rows; + /*! + * Image mask columns at left and right edges. + */ + double inactive_zone_cols; + /*! + * Mean of row motion vectors. + */ + double MVr; + /*! + * Mean of absolute value of row motion vectors. + */ + double mvr_abs; + /*! + * Mean of column motion vectors. + */ + double MVc; + /*! + * Mean of absolute value of column motion vectors. + */ + double mvc_abs; + /*! + * Variance of row motion vectors. + */ + double MVrv; + /*! + * Variance of column motion vectors. + */ + double MVcv; + /*! + * Value in range [-1,1] indicating fraction of row and column motion vectors + * that point inwards (negative MV value) or outwards (positive MV value). + * For example, value of 1 indicates, all row/column MVs are inwards. + */ + double mv_in_out_count; + /*! + * Duration of the frame / collection of frames. + */ + double duration; + /*! + * 1.0 if stats are for a single frame, or + * number of frames whose stats are accumulated. + */ + double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; +} vpx_rc_frame_stats_t; + +/*!\brief Collection of first pass frame stats + */ +typedef struct vpx_rc_firstpass_stats { + /*! + * Pointer to first pass frame stats. + * The pointed array of vpx_rc_frame_stats_t should have length equal to + * number of show frames in the video. + */ + vpx_rc_frame_stats_t *frame_stats; + /*! + * Number of show frames in the video. + */ + int num_frames; +} vpx_rc_firstpass_stats_t; + +/*!\brief Encode config sent to external rate control model + */ +typedef struct vpx_rc_config { + int frame_width; /**< frame width */ + int frame_height; /**< frame height */ + int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ + /*! + * Target bitrate in kilobytes per second + */ + int target_bitrate_kbps; + int frame_rate_num; /**< numerator of frame rate */ + int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ + int min_base_q_index; /**< for VBR mode only */ + int max_base_q_index; /**< for VBR mode only */ + int base_qp; /**< base QP for leaf frames, 0-255 */ +} vpx_rc_config_t; + +/*!\brief Control what ref frame to use and its index. + */ +typedef struct vpx_rc_ref_frame { + /*! + * Ref frame index. Corresponding to |lst_fb_idx|, |gld_fb_idx| or + * |alt_fb_idx| in VP9_COMP depending on the ref frame #name. + */ + int index[VPX_RC_MAX_REF_FRAMES]; + /*! + * Ref frame name. This decides whether the #index is used as + * |lst_fb_idx|, |gld_fb_idx| or |alt_fb_idx| in VP9_COMP. + * + */ + vpx_rc_ref_name_t name[VPX_RC_MAX_REF_FRAMES]; +} vpx_rc_ref_frame_t; + +/*!\brief The decision made by the external rate control model to set the + * group of picture. + */ +typedef struct vpx_rc_gop_decision { + int gop_coding_frames; /**< The number of frames of this GOP */ + int use_alt_ref; /**< Whether to use alt ref for this GOP */ + int use_key_frame; /**< Whether to set key frame for this GOP */ + /*! + * Frame type for each frame in this GOP. + * This will be populated to |update_type| in GF_GROUP defined in + * vp9_firstpass.h + */ + vpx_rc_frame_update_type_t update_type[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2]; + /*! Ref frame buffer index to be updated for each frame in this GOP. */ + int update_ref_index[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2]; + /*! Ref frame list to be used for each frame in this GOP. */ + vpx_rc_ref_frame_t ref_frame_list[VPX_RC_MAX_STATIC_GF_GROUP_LENGTH + 2]; +} vpx_rc_gop_decision_t; + +/*!\brief The decision made by the external rate control model to set the + * key frame location and the show frame count in the key frame group + */ +typedef struct vpx_rc_key_frame_decision { + int key_frame_show_index; /**< This key frame's show index in the video */ + int key_frame_group_size; /**< Show frame count of this key frame group */ +} vpx_rc_key_frame_decision_t; + +/*!\brief Create an external rate control model callback prototype + * + * This callback is invoked by the encoder to create an external rate control + * model. + * + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t + */ +typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( + void *priv, const vpx_rc_config_t *ratectrl_config, + vpx_rc_model_t *rate_ctrl_model_ptr); + +/*!\brief Send first pass stats to the external rate control model callback + * prototype + * + * This callback is invoked by the encoder to send first pass stats to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] first_pass_stats first pass stats + */ +typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_firstpass_stats_t *first_pass_stats); + +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Receive encode frame decision callback prototype + * + * This callback is invoked by the encoder to receive encode frame decision from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] frame_gop_index index of the frame in current gop + * \param[out] frame_decision encode decision of the coding frame + */ +typedef vpx_rc_status_t (*vpx_rc_get_encodeframe_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const int frame_gop_index, + vpx_rc_encodeframe_decision_t *frame_decision); + +/*!\brief Update encode frame result callback prototype + * + * This callback is invoked by the encoder to update encode frame result to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[out] encode_frame_result encode result of the coding frame + */ +typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + const vpx_rc_encodeframe_result_t *encode_frame_result); + +/*!\brief Get the key frame decision from the external rate control model. + * + * This callback is invoked by the encoder to get key frame decision from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[out] key_frame_decision key frame decision from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_key_frame_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, + vpx_rc_key_frame_decision_t *key_frame_decision); + +/*!\brief Get the GOP structure from the external rate control model. + * + * This callback is invoked by the encoder to get GOP decisions from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[out] gop_decision GOP decision from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, vpx_rc_gop_decision_t *gop_decision); + +/*!\brief Get the frame rdmult from the external rate control model. + * + * This callback is invoked by the encoder to get rdmult from + * the external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] frame_info information collected from the encoder + * \param[out] rdmult frame rate-distortion multiplier from the model + */ +typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info, + int *rdmult); + +/*!\brief Delete the external rate control model callback prototype + * + * This callback is invoked by the encoder to delete the external rate control + * model. + * + * \param[in] rate_ctrl_model rate control model + */ +typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model); + +/*!\brief Callback function set for external rate control. + * + * The user can enable external rate control by registering + * a set of callback functions with the codec control flag + * #VP9E_SET_EXTERNAL_RATE_CONTROL. + */ +typedef struct vpx_rc_funcs { + /*! + * The rate control type of this API. + */ + vpx_rc_type_t rc_type; + /*! + * Create an external rate control model. + */ + vpx_rc_create_model_cb_fn_t create_model; + /*! + * Send first pass stats to the external rate control model. + */ + vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; + /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; + /*! + * Get encodeframe decision from the external rate control model. + */ + vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; + /*! + * Update encodeframe result to the external rate control model. + */ + vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result; + /*! + * Get key frame decision from the external rate control model. + */ + vpx_rc_get_key_frame_decision_cb_fn_t get_key_frame_decision; + /*! + * Get GOP decisions from the external rate control model. + */ + vpx_rc_get_gop_decision_cb_fn_t get_gop_decision; + /*! + * Get rdmult for the frame from the external rate control model. + */ + vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult; + /*! + * Delete the external rate control model. + */ + vpx_rc_delete_model_cb_fn_t delete_model; + + /*! + * Rate control log path. + */ + const char *rate_ctrl_log_path; + /*! + * Private data for the external rate control model. + */ + void *priv; +} vpx_rc_funcs_t; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_EXT_RATECTRL_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_frame_buffer.h b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h index ad70cdd572..fc8320017b 100644 --- a/media/libvpx/libvpx/vpx/vpx_frame_buffer.h +++ b/media/libvpx/libvpx/vpx/vpx_frame_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_FRAME_BUFFER_H_ -#define VPX_VPX_FRAME_BUFFER_H_ +#ifndef VPX_VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_VPX_FRAME_BUFFER_H_ /*!\file * \brief Describes the decoder external frame buffer interface. @@ -52,12 +52,12 @@ typedef struct vpx_codec_frame_buffer { * data. The callback is triggered when the decoder needs a frame buffer to * decode a compressed image into. This function may be called more than once * for every call to vpx_codec_decode. The application may set fb->priv to - * some data which will be passed back in the ximage and the release function - * call. |fb| is guaranteed to not be NULL. On success the callback must - * return 0. Any failure the callback must return a value less than 0. + * some data which will be passed back in the vpx_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. * * \param[in] priv Callback's private data - * \param[in] new_size Size in bytes needed by the buffer + * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, @@ -80,4 +80,4 @@ typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, } // extern "C" #endif -#endif // VPX_VPX_FRAME_BUFFER_H_ +#endif // VPX_VPX_VPX_FRAME_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_image.h b/media/libvpx/libvpx/vpx/vpx_image.h index d6d3166d2f..cca1c42f00 100644 --- a/media/libvpx/libvpx/vpx/vpx_image.h +++ b/media/libvpx/libvpx/vpx/vpx_image.h @@ -12,8 +12,8 @@ * \brief Describes the vpx image descriptor and associated operations * */ -#ifndef VPX_VPX_IMAGE_H_ -#define VPX_VPX_IMAGE_H_ +#ifndef VPX_VPX_VPX_IMAGE_H_ +#define VPX_VPX_VPX_IMAGE_H_ #ifdef __cplusplus extern "C" { @@ -27,7 +27,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (5) /**<\hideinitializer*/ #define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ #define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ @@ -37,29 +37,13 @@ extern "C" { /*!\brief List of supported image formats */ typedef enum vpx_img_fmt { VPX_IMG_FMT_NONE, - VPX_IMG_FMT_RGB24, /**< 24 bit per pixel packed RGB */ - VPX_IMG_FMT_RGB32, /**< 32 bit per pixel packed 0RGB */ - VPX_IMG_FMT_RGB565, /**< 16 bit per pixel, 565 */ - VPX_IMG_FMT_RGB555, /**< 16 bit per pixel, 555 */ - VPX_IMG_FMT_UYVY, /**< UYVY packed YUV */ - VPX_IMG_FMT_YUY2, /**< YUYV packed YUV */ - VPX_IMG_FMT_YVYU, /**< YVYU packed YUV */ - VPX_IMG_FMT_BGR24, /**< 24 bit per pixel packed BGR */ - VPX_IMG_FMT_RGB32_LE, /**< 32 bit packed BGR0 */ - VPX_IMG_FMT_ARGB, /**< 32 bit packed ARGB, alpha=255 */ - VPX_IMG_FMT_ARGB_LE, /**< 32 bit packed BGRA, alpha=255 */ - VPX_IMG_FMT_RGB565_LE, /**< 16 bit per pixel, gggbbbbb rrrrrggg */ - VPX_IMG_FMT_RGB555_LE, /**< 16 bit per pixel, gggbbbbb 0rrrrrgg */ VPX_IMG_FMT_YV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */ VPX_IMG_FMT_I420 = VPX_IMG_FMT_PLANAR | 2, - VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | - 3, /** < planar 4:2:0 format with vpx color space */ - VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, VPX_IMG_FMT_I440 = VPX_IMG_FMT_PLANAR | 7, - VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 6, + VPX_IMG_FMT_NV12 = VPX_IMG_FMT_PLANAR | 9, VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH, @@ -80,8 +64,12 @@ typedef enum vpx_color_space { /*!\brief List of supported color range */ typedef enum vpx_color_range { - VPX_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */ - VPX_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */ + VPX_CR_STUDIO_RANGE = 0, /**<- Y [16..235], UV [16..240] (bit depth 8) */ + /**<- Y [64..940], UV [64..960] (bit depth 10) */ + /**<- Y [256..3760], UV [256..3840] (bit depth 12) */ + VPX_CR_FULL_RANGE = 1 /**<- YUV/RGB [0..255] (bit depth 8) */ + /**<- YUV/RGB [0..1023] (bit depth 10) */ + /**<- YUV/RGB [0..4095] (bit depth 12) */ } vpx_color_range_t; /**< alias for enum vpx_color_range */ /**\brief Image Descriptor */ @@ -142,16 +130,19 @@ typedef struct vpx_image_rect { /*!\brief Open a descriptor, allocating storage for the underlying image * * Returns a descriptor for storing an image of the given format. The - * storage for the descriptor is allocated on the heap. + * storage for the image is allocated on the heap. * * \param[in] img Pointer to storage for descriptor. If this parameter * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image(stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -164,30 +155,65 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, /*!\brief Open a descriptor, using existing storage for the underlying image * * Returns a descriptor for storing an image of the given format. The - * storage for descriptor has been allocated elsewhere, and a descriptor is + * storage for the image has been allocated elsewhere, and a descriptor is * desired to "wrap" that storage. * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image + * \param[in] img Pointer to storage for descriptor. If this + * parameter is NULL, the storage for the descriptor + * will be allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] stride_align Alignment, in bytes, of each row in the image + * (stride). Must not exceed 65536. + * \param[in] img_data Storage to use for the image. The storage must + * outlive the returned image descriptor; it can be + * disposed of after calling vpx_img_free(). * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. + * + * \note \a img_data is required to have a minimum allocation size that + * satisfies the requirements of the \a fmt, \a d_w, \a d_h and \a + * stride_align parameters. This size can be calculated as follows (see + * \c img_alloc_helper in the vpx_image.c file in the libvpx source tree + * for more detail): + * \code + * align = (1 << x_chroma_shift) - 1; + * w = (d_w + align) & ~align; + * align = (1 << y_chroma_shift) - 1; + * h = (d_h + align) & ~align; + * + * s = (fmt & VPX_IMG_FMT_PLANAR) ? w : (uint64_t)bps * w / 8; + * s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s; + * s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + * s = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? s / 2 : s; + * alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? (uint64_t)h * s * bps / 8 + * : (uint64_t)h * s; + * \endcode + * \a x_chroma_shift, \a y_chroma_shift and \a bps can be obtained by calling + * \ref vpx_img_wrap with a non-\c NULL \a img_data parameter. The \c + * vpx_image_t pointer should \em not be used in other API calls until \em after + * a successful call to \ref vpx_img_wrap with a valid image buffer. For + * example: + * \code + * vpx_img_wrap(img, fmt, d_w, d_h, stride_align, (unsigned char *)1); + * ... calculate buffer size and allocate buffer as described earlier + * vpx_img_wrap(img, fmt, d_w, d_h, stride_align, img_data); + * \endcode */ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, - unsigned int d_h, unsigned int align, + unsigned int d_h, unsigned int stride_align, unsigned char *img_data); /*!\brief Set the rectangle identifying the displayed portion of the image * * Updates the displayed rectangle (aka viewport) on the image surface to - * match the specified coordinates and size. + * match the specified coordinates and size. Specifically, sets img->d_w, + * img->d_h, and elements of the img->planes[] array. * * \param[in] img Image descriptor * \param[in] x leftmost column @@ -195,7 +221,7 @@ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, * \param[in] w width * \param[in] h height * - * \return 0 if the requested rectangle is valid, nonzero otherwise. + * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise. */ int vpx_img_set_rect(vpx_image_t *img, unsigned int x, unsigned int y, unsigned int w, unsigned int h); @@ -221,4 +247,4 @@ void vpx_img_free(vpx_image_t *img); } // extern "C" #endif -#endif // VPX_VPX_IMAGE_H_ +#endif // VPX_VPX_VPX_IMAGE_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_integer.h b/media/libvpx/libvpx/vpx/vpx_integer.h index 0c27142ff9..34e3796411 100644 --- a/media/libvpx/libvpx/vpx/vpx_integer.h +++ b/media/libvpx/libvpx/vpx/vpx_integer.h @@ -8,39 +8,22 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_INTEGER_H_ -#define VPX_VPX_INTEGER_H_ +#ifndef VPX_VPX_VPX_INTEGER_H_ +#define VPX_VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ -#include +#include // IWYU pragma: export #if defined(_MSC_VER) #define VPX_FORCE_INLINE __forceinline #define VPX_INLINE __inline #else -#define VPX_FORCE_INLINE __inline__ __attribute__(always_inline) +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) // TODO(jbb): Allow a way to force inline off for older compilers. #define VPX_INLINE inline #endif -#if !defined(VPX_DONT_DEFINE_STDINT_TYPES) - -#if defined(VPX_EMULATE_INTTYPES) -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; - -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - -#ifndef _UINTPTR_T_DEFINED -typedef size_t uintptr_t; -#endif - -#else - -/* Most platforms have the C99 standard integer types. */ +/* Assume platforms have the C99 standard integer types. */ #if defined(__cplusplus) #if !defined(__STDC_FORMAT_MACROS) @@ -51,17 +34,7 @@ typedef size_t uintptr_t; #endif #endif // __cplusplus -#include +#include // IWYU pragma: export +#include // IWYU pragma: export -#endif - -#endif // VPX_DONT_DEFINE_STDINT_TYPES - -/* VS2010 defines stdint.h, but not inttypes.h */ -#if defined(_MSC_VER) && _MSC_VER < 1800 -#define PRId64 "I64d" -#else -#include -#endif - -#endif // VPX_VPX_INTEGER_H_ +#endif // VPX_VPX_VPX_INTEGER_H_ diff --git a/media/libvpx/libvpx/vpx/vpx_tpl.h b/media/libvpx/libvpx/vpx/vpx_tpl.h new file mode 100644 index 0000000000..ecee5d1efe --- /dev/null +++ b/media/libvpx/libvpx/vpx/vpx_tpl.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include "./vpx_integer.h" +#include "./vpx_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION 5 /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row in pixel */ + int16_t mv_c; /**< Motion vector col in pixel */ + int64_t srcrf_rate; /**< Rate from source ref frame */ + int64_t srcrf_dist; /**< Distortion from source ref frame */ + int64_t pred_error; /**< Prediction error */ + int64_t inter_pred_err; /**< Inter prediction error */ + int64_t intra_pred_err; /**< Intra prediction error */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/add_noise.c b/media/libvpx/libvpx/vpx_dsp/add_noise.c index a2b4c9010f..6839e97928 100644 --- a/media/libvpx/libvpx/vpx_dsp/add_noise.c +++ b/media/libvpx/libvpx/vpx_dsp/add_noise.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" #include "vpx_ports/mem.h" void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, @@ -51,6 +52,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a_i) { for (j = 0; j < a_i; ++j) { + if (next + j >= 256) goto set_noise; char_dist[next + j] = (int8_t)i; } next = next + j; @@ -62,6 +64,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { char_dist[next] = 0; } +set_noise: for (i = 0; i < size; ++i) { noise[i] = char_dist[rand() & 0xff]; // NOLINT } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c index 001517d33e..1b17a326b4 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_neon.c @@ -15,145 +15,124 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + return (horizontal_add_uint16x8(c) + (1 << 3)) >> 4; } -unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; + uint16x8_t sum; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); -unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; } // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. -int vpx_satd_neon(const int16_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); +// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] +int vpx_satd_neon(const tran_low_t *coeff, int length) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + length -= 16; coeff += 16; } while (length != 0); - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } + return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); + assert(height >= 4 && height % 4 == 0); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); + ref += 4 * ref_stride; - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; + ref += 4 * ref_stride; } - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); + + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); } - return horizontal_add_u16x8(vec_sum); + return (int16_t)horizontal_add_uint16x8(sum); } // ref, src = [0, 510] - max diff = 16-bits @@ -183,7 +162,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); @@ -232,11 +211,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -250,4 +234,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 0000000000..5afdece0ab --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y = height; + do { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); + } else { + int i = width * height; + assert(width == 4); + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c index ed1a4df25c..7efce32735 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/deblock_neon.c @@ -15,6 +15,8 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/transpose_neon.h" +extern const int16_t vpx_rv[]; + static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, const uint8x8_t v0, const uint8x8_t b1, const uint8x8_t b2) { @@ -89,11 +91,6 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, int row; int col; - // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for - // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 - // (for U/V). - assert((size == 8 || size == 16) && cols % 8 == 0); - // While columns of length 16 can be processed, load them. for (col = 0; col < cols - 8; col += 16) { uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; @@ -384,3 +381,100 @@ void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, src += pitch; } } + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. + if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 0000000000..fde71ff30d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +// Main body of fdct16x16. +static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load_cross(input, stride, temp0); + scale_input(temp0, temp1); + vpx_fdct8x16_body(temp1, temp0); + + // Right half. + load_cross(input + 8, stride, temp1); + scale_input(temp1, temp2); + vpx_fdct8x16_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3); + vpx_fdct8x16_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_s16_8x8q(&temp0[8], &temp1[0]); + + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0); + vpx_fdct8x16_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + +void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[16]; + int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16], + right3[16], right4[16]; + + // Left half. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // right half. + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + + highbd_partial_round_shift(left3, right3); + highbd_cross_input(left3, right3, left1, right1); + vpx_highbd_fdct8x16_body(left1, right1); + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + + highbd_partial_round_shift(left4, right4); + highbd_cross_input(left4, right4, left2, right2); + vpx_highbd_fdct8x16_body(left2, right2); + + transpose_s32_8x8_2(left1, right1, left3, right3); + transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8); + store16_s32(output, left3); + output += 4; + store16_s32(output, right3); + output += 4; + + store16_s32(output, left4); + output += 4; + store16_s32(output, right4); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h new file mode 100644 index 0000000000..cd58675ca4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ + +#include + +#include "fdct_neon.h" + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. +static INLINE void scale_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vshlq_n_s16(a[0], 2); + b[1] = vshlq_n_s16(a[1], 2); + b[2] = vshlq_n_s16(a[2], 2); + b[3] = vshlq_n_s16(a[3], 2); + b[4] = vshlq_n_s16(a[4], 2); + b[5] = vshlq_n_s16(a[5], 2); + b[6] = vshlq_n_s16(a[6], 2); + b[7] = vshlq_n_s16(a[7], 2); + + b[8] = vshlq_n_s16(a[8], 2); + b[9] = vshlq_n_s16(a[9], 2); + b[10] = vshlq_n_s16(a[10], 2); + b[11] = vshlq_n_s16(a[11], 2); + b[12] = vshlq_n_s16(a[12], 2); + b[13] = vshlq_n_s16(a[13], 2); + b[14] = vshlq_n_s16(a[14], 2); + b[15] = vshlq_n_s16(a[15], 2); +} + +static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); +} + +static INLINE void load_cross(const int16_t *a, int stride, + int16x8_t *b /*[16]*/) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + + b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride)); + b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride)); + b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride)); + b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride)); + b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride)); + b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride)); + b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride)); + b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride)); +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, + int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/, + int32x4_t *a_right /*[16]*/, + int32x4_t *b_left /*[16]*/, + int32x4_t *b_right /*[16]*/) { + b_left[0] = vaddq_s32(a_left[0], a_left[15]); + b_left[1] = vaddq_s32(a_left[1], a_left[14]); + b_left[2] = vaddq_s32(a_left[2], a_left[13]); + b_left[3] = vaddq_s32(a_left[3], a_left[12]); + b_left[4] = vaddq_s32(a_left[4], a_left[11]); + b_left[5] = vaddq_s32(a_left[5], a_left[10]); + b_left[6] = vaddq_s32(a_left[6], a_left[9]); + b_left[7] = vaddq_s32(a_left[7], a_left[8]); + + b_right[0] = vaddq_s32(a_right[0], a_right[15]); + b_right[1] = vaddq_s32(a_right[1], a_right[14]); + b_right[2] = vaddq_s32(a_right[2], a_right[13]); + b_right[3] = vaddq_s32(a_right[3], a_right[12]); + b_right[4] = vaddq_s32(a_right[4], a_right[11]); + b_right[5] = vaddq_s32(a_right[5], a_right[10]); + b_right[6] = vaddq_s32(a_right[6], a_right[9]); + b_right[7] = vaddq_s32(a_right[7], a_right[8]); + + b_left[8] = vsubq_s32(a_left[7], a_left[8]); + b_left[9] = vsubq_s32(a_left[6], a_left[9]); + b_left[10] = vsubq_s32(a_left[5], a_left[10]); + b_left[11] = vsubq_s32(a_left[4], a_left[11]); + b_left[12] = vsubq_s32(a_left[3], a_left[12]); + b_left[13] = vsubq_s32(a_left[2], a_left[13]); + b_left[14] = vsubq_s32(a_left[1], a_left[14]); + b_left[15] = vsubq_s32(a_left[0], a_left[15]); + + b_right[8] = vsubq_s32(a_right[7], a_right[8]); + b_right[9] = vsubq_s32(a_right[6], a_right[9]); + b_right[10] = vsubq_s32(a_right[5], a_right[10]); + b_right[11] = vsubq_s32(a_right[4], a_right[11]); + b_right[12] = vsubq_s32(a_right[3], a_right[12]); + b_right[13] = vsubq_s32(a_right[2], a_right[13]); + b_right[14] = vsubq_s32(a_right[1], a_right[14]); + b_right[15] = vsubq_s32(a_right[0], a_right[15]); +} + +static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + const int32x4_t one = vdupq_n_s32(1); + left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2); + left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2); + left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2); + left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2); + left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2); + left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2); + left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2); + left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2); + left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2); + left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2); + left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2); + left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2); + left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2); + left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2); + left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2); + left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2); + + right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2); + right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2); + right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2); + right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2); + right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2); + right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2); + right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2); + right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2); + right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2); + right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2); + right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2); + right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2); + right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2); + right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2); + right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2); + right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2); +} + +// Store 16 32x4 vectors, assuming stride == 16. +static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { + vst1q_s32(a, b[0]); + a += 16; + vst1q_s32(a, b[1]); + a += 16; + vst1q_s32(a, b[2]); + a += 16; + vst1q_s32(a, b[3]); + a += 16; + vst1q_s32(a, b[4]); + a += 16; + vst1q_s32(a, b[5]); + a += 16; + vst1q_s32(a, b[6]); + a += 16; + vst1q_s32(a, b[7]); + a += 16; + vst1q_s32(a, b[8]); + a += 16; + vst1q_s32(a, b[9]); + a += 16; + vst1q_s32(a, b[10]); + a += 16; + vst1q_s32(a, b[11]); + a += 16; + vst1q_s32(a, b[12]); + a += 16; + vst1q_s32(a, b[13]); + a += 16; + vst1q_s32(a, b[14]); + a += 16; + vst1q_s32(a, b[15]); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 0000000000..a91730ce8b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/fdct32x32_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); + + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); + + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); + + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp1); + + load_cross(input + 8, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp2); + + load_cross(input + 16, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp3); + + load_cross(input + 24, stride, temp0); + scale_input(temp0, temp5); + dct_body_first_pass(temp5, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass(left5, right5); + highbd_partial_add_round_shift(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass(left6, right6); + highbd_partial_add_round_shift(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass(left7, right7); + highbd_partial_add_round_shift(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass(left8, right8); + highbd_partial_add_round_shift(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32], + right3[32], right4[32]; + int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32], + left8[32], right8[32]; + int32x4_t temp1[32], temp2[32]; + + // Process in 8x32 columns. + load_cross(input, stride, temp0); + highbd_scale_input(temp0, left1, right1); + highbd_dct8x32_body_first_pass(left1, right1); + highbd_partial_sub_round_shift(left1, right1); + + load_cross(input + 8, stride, temp0); + highbd_scale_input(temp0, left2, right2); + highbd_dct8x32_body_first_pass(left2, right2); + highbd_partial_sub_round_shift(left2, right2); + + load_cross(input + 16, stride, temp0); + highbd_scale_input(temp0, left3, right3); + highbd_dct8x32_body_first_pass(left3, right3); + highbd_partial_sub_round_shift(left3, right3); + + load_cross(input + 24, stride, temp0); + highbd_scale_input(temp0, left4, right4); + highbd_dct8x32_body_first_pass(left4, right4); + highbd_partial_sub_round_shift(left4, right4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_s32_8x8_2(left1, right1, temp1, temp2); + transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left5, right5); + highbd_dct8x32_body_second_pass_rd(left5, right5); + + // Second row of 8x32. + transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2); + transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left6, right6); + highbd_dct8x32_body_second_pass_rd(left6, right6); + + // Third row of 8x32 + transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2); + transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left7, right7); + highbd_dct8x32_body_second_pass_rd(left7, right7); + + // Final row of 8x32. + transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2); + transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8); + transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16); + transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24); + + highbd_cross_input(temp1, temp2, left8, right8); + highbd_dct8x32_body_second_pass_rd(left8, right8); + + // Final transpose + transpose_s32_8x8_2(left5, right5, left1, right1); + transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2); + transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3); + transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4); + transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8); + transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8); + transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8); + transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16); + transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16); + transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16); + transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16); + transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24); + transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24); + transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24); + transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24); + + store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4, + right4); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h new file mode 100644 index 0000000000..3b9e64c6df --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.h @@ -0,0 +1,2919 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" + +// Load & cross the first 8 and last 8, then the middle +static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) { + b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + + b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride)); + b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride)); + b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride)); + b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride)); + b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride)); + b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride)); + b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride)); + b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride)); + + b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); + b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + + b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride)); + b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride)); + b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride)); + b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride)); + b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride)); + b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride)); + b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride)); + b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride)); +} + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0) + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +static INLINE void scale_input(const int16x8_t *in /*32*/, + int16x8_t *out /*32*/) { + out[0] = vshlq_n_s16(in[0], 2); + out[1] = vshlq_n_s16(in[1], 2); + out[2] = vshlq_n_s16(in[2], 2); + out[3] = vshlq_n_s16(in[3], 2); + out[4] = vshlq_n_s16(in[4], 2); + out[5] = vshlq_n_s16(in[5], 2); + out[6] = vshlq_n_s16(in[6], 2); + out[7] = vshlq_n_s16(in[7], 2); + + out[8] = vshlq_n_s16(in[8], 2); + out[9] = vshlq_n_s16(in[9], 2); + out[10] = vshlq_n_s16(in[10], 2); + out[11] = vshlq_n_s16(in[11], 2); + out[12] = vshlq_n_s16(in[12], 2); + out[13] = vshlq_n_s16(in[13], 2); + out[14] = vshlq_n_s16(in[14], 2); + out[15] = vshlq_n_s16(in[15], 2); + + out[16] = vshlq_n_s16(in[16], 2); + out[17] = vshlq_n_s16(in[17], 2); + out[18] = vshlq_n_s16(in[18], 2); + out[19] = vshlq_n_s16(in[19], 2); + out[20] = vshlq_n_s16(in[20], 2); + out[21] = vshlq_n_s16(in[21], 2); + out[22] = vshlq_n_s16(in[22], 2); + out[23] = vshlq_n_s16(in[23], 2); + + out[24] = vshlq_n_s16(in[24], 2); + out[25] = vshlq_n_s16(in[25], 2); + out[26] = vshlq_n_s16(in[26], 2); + out[27] = vshlq_n_s16(in[27], 2); + out[28] = vshlq_n_s16(in[28], 2); + out[29] = vshlq_n_s16(in[29], 2); + out[30] = vshlq_n_s16(in[30], 2); + out[31] = vshlq_n_s16(in[31], 2); +} + +static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27], + &a[20]); + butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26], + &a[21]); + butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25], + &a[22]); + butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24], + &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. + butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift_s16(b[0]); + out[16] = sub_round_shift_s16(b[1]); + out[8] = sub_round_shift_s16(b[2]); + out[24] = sub_round_shift_s16(b[3]); + out[4] = sub_round_shift_s16(b[4]); + out[20] = sub_round_shift_s16(b[5]); + out[12] = sub_round_shift_s16(b[6]); + out[28] = sub_round_shift_s16(b[7]); + out[2] = sub_round_shift_s16(b[8]); + out[18] = sub_round_shift_s16(b[9]); + out[10] = sub_round_shift_s16(b[10]); + out[26] = sub_round_shift_s16(b[11]); + out[6] = sub_round_shift_s16(b[12]); + out[22] = sub_round_shift_s16(b[13]); + out[14] = sub_round_shift_s16(b[14]); + out[30] = sub_round_shift_s16(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]); + out[1] = sub_round_shift_s16(a[1]); + out[31] = sub_round_shift_s16(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]); + out[17] = sub_round_shift_s16(a[17]); + out[15] = sub_round_shift_s16(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]); + out[9] = sub_round_shift_s16(a[9]); + out[23] = sub_round_shift_s16(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]); + out[25] = sub_round_shift_s16(a[25]); + out[7] = sub_round_shift_s16(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]); + out[5] = sub_round_shift_s16(a[5]); + out[27] = sub_round_shift_s16(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]); + out[21] = sub_round_shift_s16(a[21]); + out[11] = sub_round_shift_s16(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]); + out[13] = sub_round_shift_s16(a[13]); + out[19] = sub_round_shift_s16(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]); + out[29] = sub_round_shift_s16(a[29]); + out[3] = sub_round_shift_s16(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32_fast( \ + a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \ + a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. + ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22); + + // Stage 7. + PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31); + out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15); + out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23); + out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7); + out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27); + out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11); + out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19); + out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3); + out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]); +} + +static INLINE void dct_body_second_pass_rd(const int16x8_t *in, + int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. + b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]); + + // Stage 7. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +#if CONFIG_VP9_HIGHBITDEPTH + +// Store 32 32x4 vectors, assuming stride == 32. +static INLINE void store32x32_s32( + tran_low_t *a, const int32x4_t *l1 /*[16]*/, const int32x4_t *r1 /*[16]*/, + const int32x4_t *l2 /*[16]*/, const int32x4_t *r2 /*[16]*/, + const int32x4_t *l3 /*[16]*/, const int32x4_t *r3 /*[16]*/, + const int32x4_t *l4 /*[16]*/, const int32x4_t *r4 /*[16]*/) { + int i; + for (i = 0; i < 32; i++) { + vst1q_s32(a, l1[i]); + vst1q_s32(a + 4, r1[i]); + vst1q_s32(a + 8, l2[i]); + vst1q_s32(a + 12, r2[i]); + vst1q_s32(a + 16, l3[i]); + vst1q_s32(a + 20, r3[i]); + vst1q_s32(a + 24, l4[i]); + vst1q_s32(a + 28, r4[i]); + a += 32; + } +} + +static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/, + int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + left[0] = vshll_n_s16(vget_low_s16(a[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(a[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(a[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(a[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(a[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(a[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(a[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(a[7]), 2); + left[8] = vshll_n_s16(vget_low_s16(a[8]), 2); + left[9] = vshll_n_s16(vget_low_s16(a[9]), 2); + left[10] = vshll_n_s16(vget_low_s16(a[10]), 2); + left[11] = vshll_n_s16(vget_low_s16(a[11]), 2); + left[12] = vshll_n_s16(vget_low_s16(a[12]), 2); + left[13] = vshll_n_s16(vget_low_s16(a[13]), 2); + left[14] = vshll_n_s16(vget_low_s16(a[14]), 2); + left[15] = vshll_n_s16(vget_low_s16(a[15]), 2); + left[16] = vshll_n_s16(vget_low_s16(a[16]), 2); + left[17] = vshll_n_s16(vget_low_s16(a[17]), 2); + left[18] = vshll_n_s16(vget_low_s16(a[18]), 2); + left[19] = vshll_n_s16(vget_low_s16(a[19]), 2); + left[20] = vshll_n_s16(vget_low_s16(a[20]), 2); + left[21] = vshll_n_s16(vget_low_s16(a[21]), 2); + left[22] = vshll_n_s16(vget_low_s16(a[22]), 2); + left[23] = vshll_n_s16(vget_low_s16(a[23]), 2); + left[24] = vshll_n_s16(vget_low_s16(a[24]), 2); + left[25] = vshll_n_s16(vget_low_s16(a[25]), 2); + left[26] = vshll_n_s16(vget_low_s16(a[26]), 2); + left[27] = vshll_n_s16(vget_low_s16(a[27]), 2); + left[28] = vshll_n_s16(vget_low_s16(a[28]), 2); + left[29] = vshll_n_s16(vget_low_s16(a[29]), 2); + left[30] = vshll_n_s16(vget_low_s16(a[30]), 2); + left[31] = vshll_n_s16(vget_low_s16(a[31]), 2); + + right[0] = vshll_n_s16(vget_high_s16(a[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(a[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(a[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(a[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(a[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(a[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(a[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(a[7]), 2); + right[8] = vshll_n_s16(vget_high_s16(a[8]), 2); + right[9] = vshll_n_s16(vget_high_s16(a[9]), 2); + right[10] = vshll_n_s16(vget_high_s16(a[10]), 2); + right[11] = vshll_n_s16(vget_high_s16(a[11]), 2); + right[12] = vshll_n_s16(vget_high_s16(a[12]), 2); + right[13] = vshll_n_s16(vget_high_s16(a[13]), 2); + right[14] = vshll_n_s16(vget_high_s16(a[14]), 2); + right[15] = vshll_n_s16(vget_high_s16(a[15]), 2); + right[16] = vshll_n_s16(vget_high_s16(a[16]), 2); + right[17] = vshll_n_s16(vget_high_s16(a[17]), 2); + right[18] = vshll_n_s16(vget_high_s16(a[18]), 2); + right[19] = vshll_n_s16(vget_high_s16(a[19]), 2); + right[20] = vshll_n_s16(vget_high_s16(a[20]), 2); + right[21] = vshll_n_s16(vget_high_s16(a[21]), 2); + right[22] = vshll_n_s16(vget_high_s16(a[22]), 2); + right[23] = vshll_n_s16(vget_high_s16(a[23]), 2); + right[24] = vshll_n_s16(vget_high_s16(a[24]), 2); + right[25] = vshll_n_s16(vget_high_s16(a[25]), 2); + right[26] = vshll_n_s16(vget_high_s16(a[26]), 2); + right[27] = vshll_n_s16(vget_high_s16(a[27]), 2); + right[28] = vshll_n_s16(vget_high_s16(a[28]), 2); + right[29] = vshll_n_s16(vget_high_s16(a[29]), 2); + right[30] = vshll_n_s16(vget_high_s16(a[30]), 2); + right[31] = vshll_n_s16(vget_high_s16(a[31]), 2); +} + +static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/, + int32x4_t *a_right /*[32]*/, + int32x4_t *b_left /*[32]*/, + int32x4_t *b_right /*[32]*/) { + // Stage 1. Done as part of the load for the first pass. + b_left[0] = vaddq_s32(a_left[0], a_left[31]); + b_left[1] = vaddq_s32(a_left[1], a_left[30]); + b_left[2] = vaddq_s32(a_left[2], a_left[29]); + b_left[3] = vaddq_s32(a_left[3], a_left[28]); + b_left[4] = vaddq_s32(a_left[4], a_left[27]); + b_left[5] = vaddq_s32(a_left[5], a_left[26]); + b_left[6] = vaddq_s32(a_left[6], a_left[25]); + b_left[7] = vaddq_s32(a_left[7], a_left[24]); + b_left[8] = vaddq_s32(a_left[8], a_left[23]); + b_left[9] = vaddq_s32(a_left[9], a_left[22]); + b_left[10] = vaddq_s32(a_left[10], a_left[21]); + b_left[11] = vaddq_s32(a_left[11], a_left[20]); + b_left[12] = vaddq_s32(a_left[12], a_left[19]); + b_left[13] = vaddq_s32(a_left[13], a_left[18]); + b_left[14] = vaddq_s32(a_left[14], a_left[17]); + b_left[15] = vaddq_s32(a_left[15], a_left[16]); + + b_right[0] = vaddq_s32(a_right[0], a_right[31]); + b_right[1] = vaddq_s32(a_right[1], a_right[30]); + b_right[2] = vaddq_s32(a_right[2], a_right[29]); + b_right[3] = vaddq_s32(a_right[3], a_right[28]); + b_right[4] = vaddq_s32(a_right[4], a_right[27]); + b_right[5] = vaddq_s32(a_right[5], a_right[26]); + b_right[6] = vaddq_s32(a_right[6], a_right[25]); + b_right[7] = vaddq_s32(a_right[7], a_right[24]); + b_right[8] = vaddq_s32(a_right[8], a_right[23]); + b_right[9] = vaddq_s32(a_right[9], a_right[22]); + b_right[10] = vaddq_s32(a_right[10], a_right[21]); + b_right[11] = vaddq_s32(a_right[11], a_right[20]); + b_right[12] = vaddq_s32(a_right[12], a_right[19]); + b_right[13] = vaddq_s32(a_right[13], a_right[18]); + b_right[14] = vaddq_s32(a_right[14], a_right[17]); + b_right[15] = vaddq_s32(a_right[15], a_right[16]); + + b_left[16] = vsubq_s32(a_left[15], a_left[16]); + b_left[17] = vsubq_s32(a_left[14], a_left[17]); + b_left[18] = vsubq_s32(a_left[13], a_left[18]); + b_left[19] = vsubq_s32(a_left[12], a_left[19]); + b_left[20] = vsubq_s32(a_left[11], a_left[20]); + b_left[21] = vsubq_s32(a_left[10], a_left[21]); + b_left[22] = vsubq_s32(a_left[9], a_left[22]); + b_left[23] = vsubq_s32(a_left[8], a_left[23]); + b_left[24] = vsubq_s32(a_left[7], a_left[24]); + b_left[25] = vsubq_s32(a_left[6], a_left[25]); + b_left[26] = vsubq_s32(a_left[5], a_left[26]); + b_left[27] = vsubq_s32(a_left[4], a_left[27]); + b_left[28] = vsubq_s32(a_left[3], a_left[28]); + b_left[29] = vsubq_s32(a_left[2], a_left[29]); + b_left[30] = vsubq_s32(a_left[1], a_left[30]); + b_left[31] = vsubq_s32(a_left[0], a_left[31]); + + b_right[16] = vsubq_s32(a_right[15], a_right[16]); + b_right[17] = vsubq_s32(a_right[14], a_right[17]); + b_right[18] = vsubq_s32(a_right[13], a_right[18]); + b_right[19] = vsubq_s32(a_right[12], a_right[19]); + b_right[20] = vsubq_s32(a_right[11], a_right[20]); + b_right[21] = vsubq_s32(a_right[10], a_right[21]); + b_right[22] = vsubq_s32(a_right[9], a_right[22]); + b_right[23] = vsubq_s32(a_right[8], a_right[23]); + b_right[24] = vsubq_s32(a_right[7], a_right[24]); + b_right[25] = vsubq_s32(a_right[6], a_right[25]); + b_right[26] = vsubq_s32(a_right[5], a_right[26]); + b_right[27] = vsubq_s32(a_right[4], a_right[27]); + b_right[28] = vsubq_s32(a_right[3], a_right[28]); + b_right[29] = vsubq_s32(a_right[2], a_right[29]); + b_right[30] = vsubq_s32(a_right[1], a_right[30]); + b_right[31] = vsubq_s32(a_right[0], a_right[31]); +} + +static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = add_round_shift_s32(left[0]); + left[1] = add_round_shift_s32(left[1]); + left[2] = add_round_shift_s32(left[2]); + left[3] = add_round_shift_s32(left[3]); + left[4] = add_round_shift_s32(left[4]); + left[5] = add_round_shift_s32(left[5]); + left[6] = add_round_shift_s32(left[6]); + left[7] = add_round_shift_s32(left[7]); + left[8] = add_round_shift_s32(left[8]); + left[9] = add_round_shift_s32(left[9]); + left[10] = add_round_shift_s32(left[10]); + left[11] = add_round_shift_s32(left[11]); + left[12] = add_round_shift_s32(left[12]); + left[13] = add_round_shift_s32(left[13]); + left[14] = add_round_shift_s32(left[14]); + left[15] = add_round_shift_s32(left[15]); + left[16] = add_round_shift_s32(left[16]); + left[17] = add_round_shift_s32(left[17]); + left[18] = add_round_shift_s32(left[18]); + left[19] = add_round_shift_s32(left[19]); + left[20] = add_round_shift_s32(left[20]); + left[21] = add_round_shift_s32(left[21]); + left[22] = add_round_shift_s32(left[22]); + left[23] = add_round_shift_s32(left[23]); + left[24] = add_round_shift_s32(left[24]); + left[25] = add_round_shift_s32(left[25]); + left[26] = add_round_shift_s32(left[26]); + left[27] = add_round_shift_s32(left[27]); + left[28] = add_round_shift_s32(left[28]); + left[29] = add_round_shift_s32(left[29]); + left[30] = add_round_shift_s32(left[30]); + left[31] = add_round_shift_s32(left[31]); + + right[0] = add_round_shift_s32(right[0]); + right[1] = add_round_shift_s32(right[1]); + right[2] = add_round_shift_s32(right[2]); + right[3] = add_round_shift_s32(right[3]); + right[4] = add_round_shift_s32(right[4]); + right[5] = add_round_shift_s32(right[5]); + right[6] = add_round_shift_s32(right[6]); + right[7] = add_round_shift_s32(right[7]); + right[8] = add_round_shift_s32(right[8]); + right[9] = add_round_shift_s32(right[9]); + right[10] = add_round_shift_s32(right[10]); + right[11] = add_round_shift_s32(right[11]); + right[12] = add_round_shift_s32(right[12]); + right[13] = add_round_shift_s32(right[13]); + right[14] = add_round_shift_s32(right[14]); + right[15] = add_round_shift_s32(right[15]); + right[16] = add_round_shift_s32(right[16]); + right[17] = add_round_shift_s32(right[17]); + right[18] = add_round_shift_s32(right[18]); + right[19] = add_round_shift_s32(right[19]); + right[20] = add_round_shift_s32(right[20]); + right[21] = add_round_shift_s32(right[21]); + right[22] = add_round_shift_s32(right[22]); + right[23] = add_round_shift_s32(right[23]); + right[24] = add_round_shift_s32(right[24]); + right[25] = add_round_shift_s32(right[25]); + right[26] = add_round_shift_s32(right[26]); + right[27] = add_round_shift_s32(right[27]); + right[28] = add_round_shift_s32(right[28]); + right[29] = add_round_shift_s32(right[29]); + right[30] = add_round_shift_s32(right[30]); + right[31] = add_round_shift_s32(right[31]); +} + +static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/, + int32x4_t *right /* [32] */) { + // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + + left[0] = sub_round_shift_s32(left[0]); + left[1] = sub_round_shift_s32(left[1]); + left[2] = sub_round_shift_s32(left[2]); + left[3] = sub_round_shift_s32(left[3]); + left[4] = sub_round_shift_s32(left[4]); + left[5] = sub_round_shift_s32(left[5]); + left[6] = sub_round_shift_s32(left[6]); + left[7] = sub_round_shift_s32(left[7]); + left[8] = sub_round_shift_s32(left[8]); + left[9] = sub_round_shift_s32(left[9]); + left[10] = sub_round_shift_s32(left[10]); + left[11] = sub_round_shift_s32(left[11]); + left[12] = sub_round_shift_s32(left[12]); + left[13] = sub_round_shift_s32(left[13]); + left[14] = sub_round_shift_s32(left[14]); + left[15] = sub_round_shift_s32(left[15]); + left[16] = sub_round_shift_s32(left[16]); + left[17] = sub_round_shift_s32(left[17]); + left[18] = sub_round_shift_s32(left[18]); + left[19] = sub_round_shift_s32(left[19]); + left[20] = sub_round_shift_s32(left[20]); + left[21] = sub_round_shift_s32(left[21]); + left[22] = sub_round_shift_s32(left[22]); + left[23] = sub_round_shift_s32(left[23]); + left[24] = sub_round_shift_s32(left[24]); + left[25] = sub_round_shift_s32(left[25]); + left[26] = sub_round_shift_s32(left[26]); + left[27] = sub_round_shift_s32(left[27]); + left[28] = sub_round_shift_s32(left[28]); + left[29] = sub_round_shift_s32(left[29]); + left[30] = sub_round_shift_s32(left[30]); + left[31] = sub_round_shift_s32(left[31]); + + right[0] = sub_round_shift_s32(right[0]); + right[1] = sub_round_shift_s32(right[1]); + right[2] = sub_round_shift_s32(right[2]); + right[3] = sub_round_shift_s32(right[3]); + right[4] = sub_round_shift_s32(right[4]); + right[5] = sub_round_shift_s32(right[5]); + right[6] = sub_round_shift_s32(right[6]); + right[7] = sub_round_shift_s32(right[7]); + right[8] = sub_round_shift_s32(right[8]); + right[9] = sub_round_shift_s32(right[9]); + right[10] = sub_round_shift_s32(right[10]); + right[11] = sub_round_shift_s32(right[11]); + right[12] = sub_round_shift_s32(right[12]); + right[13] = sub_round_shift_s32(right[13]); + right[14] = sub_round_shift_s32(right[14]); + right[15] = sub_round_shift_s32(right[15]); + right[16] = sub_round_shift_s32(right[16]); + right[17] = sub_round_shift_s32(right[17]); + right[18] = sub_round_shift_s32(right[18]); + right[19] = sub_round_shift_s32(right[19]); + right[20] = sub_round_shift_s32(right[20]); + right[21] = sub_round_shift_s32(right[21]); + right[22] = sub_round_shift_s32(right[22]); + right[23] = sub_round_shift_s32(right[23]); + right[24] = sub_round_shift_s32(right[24]); + right[25] = sub_round_shift_s32(right[25]); + right[26] = sub_round_shift_s32(right[26]); + right[27] = sub_round_shift_s32(right[27]); + right[28] = sub_round_shift_s32(right[28]); + right[29] = sub_round_shift_s32(right[29]); + right[30] = sub_round_shift_s32(right[30]); + right[31] = sub_round_shift_s32(right[31]); +} + +static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. + al[0] = vaddq_s32(left[0], left[15]); + ar[0] = vaddq_s32(right[0], right[15]); + al[1] = vaddq_s32(left[1], left[14]); + ar[1] = vaddq_s32(right[1], right[14]); + al[2] = vaddq_s32(left[2], left[13]); + ar[2] = vaddq_s32(right[2], right[13]); + al[3] = vaddq_s32(left[3], left[12]); + ar[3] = vaddq_s32(right[3], right[12]); + al[4] = vaddq_s32(left[4], left[11]); + ar[4] = vaddq_s32(right[4], right[11]); + al[5] = vaddq_s32(left[5], left[10]); + ar[5] = vaddq_s32(right[5], right[10]); + al[6] = vaddq_s32(left[6], left[9]); + ar[6] = vaddq_s32(right[6], right[9]); + al[7] = vaddq_s32(left[7], left[8]); + ar[7] = vaddq_s32(right[7], right[8]); + + al[8] = vsubq_s32(left[7], left[8]); + ar[8] = vsubq_s32(right[7], right[8]); + al[9] = vsubq_s32(left[6], left[9]); + ar[9] = vsubq_s32(right[6], right[9]); + al[10] = vsubq_s32(left[5], left[10]); + ar[10] = vsubq_s32(right[5], right[10]); + al[11] = vsubq_s32(left[4], left[11]); + ar[11] = vsubq_s32(right[4], right[11]); + al[12] = vsubq_s32(left[3], left[12]); + ar[12] = vsubq_s32(right[3], right[12]); + al[13] = vsubq_s32(left[2], left[13]); + ar[13] = vsubq_s32(right[2], right[13]); + al[14] = vsubq_s32(left[1], left[14]); + ar[14] = vsubq_s32(right[1], right[14]); + al[15] = vsubq_s32(left[0], left[15]); + ar[15] = vsubq_s32(right[0], right[15]); + + al[16] = left[16]; + ar[16] = right[16]; + al[17] = left[17]; + ar[17] = right[17]; + al[18] = left[18]; + ar[18] = right[18]; + al[19] = left[19]; + ar[19] = right[19]; + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[28] = left[28]; + ar[28] = right[28]; + al[29] = left[29]; + ar[29] = right[29]; + al[30] = left[30]; + ar[30] = right[30]; + al[31] = left[31]; + ar[31] = right[31]; + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(left[16], al[23]); + br[16] = vaddq_s32(right[16], ar[23]); + bl[17] = vaddq_s32(left[17], al[22]); + br[17] = vaddq_s32(right[17], ar[22]); + bl[18] = vaddq_s32(left[18], al[21]); + br[18] = vaddq_s32(right[18], ar[21]); + bl[19] = vaddq_s32(left[19], al[20]); + br[19] = vaddq_s32(right[19], ar[20]); + + bl[20] = vsubq_s32(left[19], al[20]); + br[20] = vsubq_s32(right[19], ar[20]); + bl[21] = vsubq_s32(left[18], al[21]); + br[21] = vsubq_s32(right[18], ar[21]); + bl[22] = vsubq_s32(left[17], al[22]); + br[22] = vsubq_s32(right[17], ar[22]); + bl[23] = vsubq_s32(left[16], al[23]); + br[23] = vsubq_s32(right[16], ar[23]); + + bl[24] = vsubq_s32(left[31], al[24]); + br[24] = vsubq_s32(right[31], ar[24]); + bl[25] = vsubq_s32(left[30], al[25]); + br[25] = vsubq_s32(right[30], ar[25]); + bl[26] = vsubq_s32(left[29], al[26]); + br[26] = vsubq_s32(right[29], ar[26]); + bl[27] = vsubq_s32(left[28], al[27]); + br[27] = vsubq_s32(right[28], ar[27]); + + bl[28] = vaddq_s32(left[28], al[27]); + br[28] = vaddq_s32(right[28], ar[27]); + bl[29] = vaddq_s32(left[29], al[26]); + br[29] = vaddq_s32(right[29], ar[26]); + bl[30] = vaddq_s32(left[30], al[25]); + br[30] = vaddq_s32(right[30], ar[25]); + bl[31] = vaddq_s32(left[31], al[24]); + br[31] = vaddq_s32(right[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], + &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], + &ar[19]); + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], + cospi_24_64, -cospi_8_64, &al[27], &ar[27], + &al[20], &ar[20]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_24_64, -cospi_8_64, &al[26], &ar[26], + &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64, + cospi_24_64, &bl[2], &br[2], &bl[3], + &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64, + cospi_24_64, &bl[14], &br[14], &bl[9], + &br[9]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_24_64, -cospi_8_64, &bl[13], &br[13], + &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64, + cospi_28_64, &al[4], &ar[4], &al[7], + &ar[7]); + butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64, + cospi_12_64, &al[5], &ar[5], &al[6], + &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], + &ar[17]); + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], + cospi_28_64, -cospi_4_64, &al[29], &ar[29], + &al[18], &ar[18]); + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_20_64, cospi_12_64, &al[26], &ar[26], + &al[21], &ar[21]); + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_12_64, -cospi_20_64, &al[25], + &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64, + cospi_30_64, &bl[8], &br[8], &bl[15], + &br[15]); + butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], + &br[14]); + butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10], + cospi_10_64, cospi_22_64, &bl[10], &br[10], + &bl[13], &br[13]); + butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11], + cospi_26_64, cospi_6_64, &bl[11], &br[11], + &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], + &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], + cospi_17_64, cospi_15_64, &al[17], &ar[17], + &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], + &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], + cospi_25_64, cospi_7_64, &al[25], &ar[25], + &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], + &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21], + cospi_21_64, cospi_11_64, &al[21], &ar[21], + &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22], + cospi_13_64, cospi_19_64, &al[13], &ar[13], + &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23], + cospi_29_64, cospi_3_64, &al[29], &ar[29], + &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/, + int32x4_t *right /*32*/) { + int32x4_t al[32], ar[32]; + int32x4_t bl[32], br[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. + al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15])); + ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15])); + al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14])); + ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14])); + al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13])); + ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13])); + al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12])); + ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12])); + al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11])); + ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11])); + al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10])); + ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10])); + al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9])); + ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9])); + al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8])); + ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8])); + + al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8])); + ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8])); + al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9])); + ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9])); + al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10])); + ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10])); + al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11])); + ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11])); + al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12])); + ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12])); + al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13])); + ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13])); + al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14])); + ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14])); + al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15])); + ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15])); + + al[16] = add_round_shift_s32(left[16]); + ar[16] = add_round_shift_s32(right[16]); + al[17] = add_round_shift_s32(left[17]); + ar[17] = add_round_shift_s32(right[17]); + al[18] = add_round_shift_s32(left[18]); + ar[18] = add_round_shift_s32(right[18]); + al[19] = add_round_shift_s32(left[19]); + ar[19] = add_round_shift_s32(right[19]); + + butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20], + cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21], + cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22], + cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]); + butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23], + cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]); + + al[20] = add_round_shift_s32(al[20]); + ar[20] = add_round_shift_s32(ar[20]); + al[21] = add_round_shift_s32(al[21]); + ar[21] = add_round_shift_s32(ar[21]); + al[22] = add_round_shift_s32(al[22]); + ar[22] = add_round_shift_s32(ar[22]); + al[23] = add_round_shift_s32(al[23]); + ar[23] = add_round_shift_s32(ar[23]); + al[24] = add_round_shift_s32(al[24]); + ar[24] = add_round_shift_s32(ar[24]); + al[25] = add_round_shift_s32(al[25]); + ar[25] = add_round_shift_s32(ar[25]); + al[26] = add_round_shift_s32(al[26]); + ar[26] = add_round_shift_s32(ar[26]); + al[27] = add_round_shift_s32(al[27]); + ar[27] = add_round_shift_s32(ar[27]); + + al[28] = add_round_shift_s32(left[28]); + ar[28] = add_round_shift_s32(right[28]); + al[29] = add_round_shift_s32(left[29]); + ar[29] = add_round_shift_s32(right[29]); + al[30] = add_round_shift_s32(left[30]); + ar[30] = add_round_shift_s32(right[30]); + al[31] = add_round_shift_s32(left[31]); + ar[31] = add_round_shift_s32(right[31]); + + // Stage 3. + bl[0] = vaddq_s32(al[0], al[7]); + br[0] = vaddq_s32(ar[0], ar[7]); + bl[1] = vaddq_s32(al[1], al[6]); + br[1] = vaddq_s32(ar[1], ar[6]); + bl[2] = vaddq_s32(al[2], al[5]); + br[2] = vaddq_s32(ar[2], ar[5]); + bl[3] = vaddq_s32(al[3], al[4]); + br[3] = vaddq_s32(ar[3], ar[4]); + + bl[4] = vsubq_s32(al[3], al[4]); + br[4] = vsubq_s32(ar[3], ar[4]); + bl[5] = vsubq_s32(al[2], al[5]); + br[5] = vsubq_s32(ar[2], ar[5]); + bl[6] = vsubq_s32(al[1], al[6]); + br[6] = vsubq_s32(ar[1], ar[6]); + bl[7] = vsubq_s32(al[0], al[7]); + br[7] = vsubq_s32(ar[0], ar[7]); + + bl[8] = al[8]; + br[8] = ar[8]; + bl[9] = al[9]; + br[9] = ar[9]; + + butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64, + &bl[13], &br[13], &bl[10], &br[10]); + butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64, + &bl[12], &br[12], &bl[11], &br[11]); + + bl[14] = al[14]; + br[14] = ar[14]; + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[16], al[23]); + br[16] = vaddq_s32(ar[16], ar[23]); + bl[17] = vaddq_s32(al[17], al[22]); + br[17] = vaddq_s32(ar[17], ar[22]); + bl[18] = vaddq_s32(al[18], al[21]); + br[18] = vaddq_s32(ar[18], ar[21]); + bl[19] = vaddq_s32(al[19], al[20]); + br[19] = vaddq_s32(ar[19], ar[20]); + + bl[20] = vsubq_s32(al[19], al[20]); + br[20] = vsubq_s32(ar[19], ar[20]); + bl[21] = vsubq_s32(al[18], al[21]); + br[21] = vsubq_s32(ar[18], ar[21]); + bl[22] = vsubq_s32(al[17], al[22]); + br[22] = vsubq_s32(ar[17], ar[22]); + bl[23] = vsubq_s32(al[16], al[23]); + br[23] = vsubq_s32(ar[16], ar[23]); + + bl[24] = vsubq_s32(al[31], al[24]); + br[24] = vsubq_s32(ar[31], ar[24]); + bl[25] = vsubq_s32(al[30], al[25]); + br[25] = vsubq_s32(ar[30], ar[25]); + bl[26] = vsubq_s32(al[29], al[26]); + br[26] = vsubq_s32(ar[29], ar[26]); + bl[27] = vsubq_s32(al[28], al[27]); + br[27] = vsubq_s32(ar[28], ar[27]); + + bl[28] = vaddq_s32(al[28], al[27]); + br[28] = vaddq_s32(ar[28], ar[27]); + bl[29] = vaddq_s32(al[29], al[26]); + br[29] = vaddq_s32(ar[29], ar[26]); + bl[30] = vaddq_s32(al[30], al[25]); + br[30] = vaddq_s32(ar[30], ar[25]); + bl[31] = vaddq_s32(al[31], al[24]); + br[31] = vaddq_s32(ar[31], ar[24]); + + // Stage 4. + al[0] = vaddq_s32(bl[0], bl[3]); + ar[0] = vaddq_s32(br[0], br[3]); + al[1] = vaddq_s32(bl[1], bl[2]); + ar[1] = vaddq_s32(br[1], br[2]); + al[2] = vsubq_s32(bl[1], bl[2]); + ar[2] = vsubq_s32(br[1], br[2]); + al[3] = vsubq_s32(bl[0], bl[3]); + ar[3] = vsubq_s32(br[0], br[3]); + + al[4] = bl[4]; + ar[4] = br[4]; + + butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6], + &ar[6], &al[5], &ar[5]); + + al[7] = bl[7]; + ar[7] = br[7]; + + al[8] = vaddq_s32(bl[8], bl[11]); + ar[8] = vaddq_s32(br[8], br[11]); + al[9] = vaddq_s32(bl[9], bl[10]); + ar[9] = vaddq_s32(br[9], br[10]); + al[10] = vsubq_s32(bl[9], bl[10]); + ar[10] = vsubq_s32(br[9], br[10]); + al[11] = vsubq_s32(bl[8], bl[11]); + ar[11] = vsubq_s32(br[8], br[11]); + al[12] = vsubq_s32(bl[15], bl[12]); + ar[12] = vsubq_s32(br[15], br[12]); + al[13] = vsubq_s32(bl[14], bl[13]); + ar[13] = vsubq_s32(br[14], br[13]); + al[14] = vaddq_s32(bl[14], bl[13]); + ar[14] = vaddq_s32(br[14], br[13]); + al[15] = vaddq_s32(bl[15], bl[12]); + ar[15] = vaddq_s32(br[15], br[12]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[17] = bl[17]; + ar[17] = br[17]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64, + cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64, + cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]); + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64, + -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64, + -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]); + + al[22] = bl[22]; + ar[22] = br[22]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[25] = bl[25]; + ar[25] = br[25]; + + al[30] = bl[30]; + ar[30] = br[30]; + al[31] = bl[31]; + ar[31] = br[31]; + + // Stage 5. + butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0], + &br[0], &bl[1], &br[1]); + butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64, + &bl[2], &br[2], &bl[3], &br[3]); + + bl[4] = vaddq_s32(al[4], al[5]); + br[4] = vaddq_s32(ar[4], ar[5]); + bl[5] = vsubq_s32(al[4], al[5]); + br[5] = vsubq_s32(ar[4], ar[5]); + bl[6] = vsubq_s32(al[7], al[6]); + br[6] = vsubq_s32(ar[7], ar[6]); + bl[7] = vaddq_s32(al[7], al[6]); + br[7] = vaddq_s32(ar[7], ar[6]); + + bl[8] = al[8]; + br[8] = ar[8]; + + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64, + &bl[14], &br[14], &bl[9], &br[9]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64, + -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]); + + bl[11] = al[11]; + br[11] = ar[11]; + bl[12] = al[12]; + br[12] = ar[12]; + + bl[15] = al[15]; + br[15] = ar[15]; + + bl[16] = vaddq_s32(al[19], al[16]); + br[16] = vaddq_s32(ar[19], ar[16]); + bl[17] = vaddq_s32(al[18], al[17]); + br[17] = vaddq_s32(ar[18], ar[17]); + bl[18] = vsubq_s32(al[17], al[18]); + br[18] = vsubq_s32(ar[17], ar[18]); + bl[19] = vsubq_s32(al[16], al[19]); + br[19] = vsubq_s32(ar[16], ar[19]); + bl[20] = vsubq_s32(al[23], al[20]); + br[20] = vsubq_s32(ar[23], ar[20]); + bl[21] = vsubq_s32(al[22], al[21]); + br[21] = vsubq_s32(ar[22], ar[21]); + bl[22] = vaddq_s32(al[21], al[22]); + br[22] = vaddq_s32(ar[21], ar[22]); + bl[23] = vaddq_s32(al[20], al[23]); + br[23] = vaddq_s32(ar[20], ar[23]); + bl[24] = vaddq_s32(al[27], al[24]); + br[24] = vaddq_s32(ar[27], ar[24]); + bl[25] = vaddq_s32(al[26], al[25]); + br[25] = vaddq_s32(ar[26], ar[25]); + bl[26] = vsubq_s32(al[25], al[26]); + br[26] = vsubq_s32(ar[25], ar[26]); + bl[27] = vsubq_s32(al[24], al[27]); + br[27] = vsubq_s32(ar[24], ar[27]); + bl[28] = vsubq_s32(al[31], al[28]); + br[28] = vsubq_s32(ar[31], ar[28]); + bl[29] = vsubq_s32(al[30], al[29]); + br[29] = vsubq_s32(ar[30], ar[29]); + bl[30] = vaddq_s32(al[29], al[30]); + br[30] = vaddq_s32(ar[29], ar[30]); + bl[31] = vaddq_s32(al[28], al[31]); + br[31] = vaddq_s32(ar[28], ar[31]); + + // Stage 6. + al[0] = bl[0]; + ar[0] = br[0]; + al[1] = bl[1]; + ar[1] = br[1]; + al[2] = bl[2]; + ar[2] = br[2]; + al[3] = bl[3]; + ar[3] = br[3]; + + butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64, + &al[4], &ar[4], &al[7], &ar[7]); + butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64, + &al[5], &ar[5], &al[6], &ar[6]); + + al[8] = vaddq_s32(bl[8], bl[9]); + ar[8] = vaddq_s32(br[8], br[9]); + al[9] = vsubq_s32(bl[8], bl[9]); + ar[9] = vsubq_s32(br[8], br[9]); + al[10] = vsubq_s32(bl[11], bl[10]); + ar[10] = vsubq_s32(br[11], br[10]); + al[11] = vaddq_s32(bl[11], bl[10]); + ar[11] = vaddq_s32(br[11], br[10]); + al[12] = vaddq_s32(bl[12], bl[13]); + ar[12] = vaddq_s32(br[12], br[13]); + al[13] = vsubq_s32(bl[12], bl[13]); + ar[13] = vsubq_s32(br[12], br[13]); + al[14] = vsubq_s32(bl[15], bl[14]); + ar[14] = vsubq_s32(br[15], br[14]); + al[15] = vaddq_s32(bl[15], bl[14]); + ar[15] = vaddq_s32(br[15], br[14]); + + al[16] = bl[16]; + ar[16] = br[16]; + al[19] = bl[19]; + ar[19] = br[19]; + al[20] = bl[20]; + ar[20] = br[20]; + al[23] = bl[23]; + ar[23] = br[23]; + al[24] = bl[24]; + ar[24] = br[24]; + al[27] = bl[27]; + ar[27] = br[27]; + al[28] = bl[28]; + ar[28] = br[28]; + al[31] = bl[31]; + ar[31] = br[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64, + cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]); + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64, + -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]); + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64, + cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]); + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64, + -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]); + + // Stage 7. + bl[0] = al[0]; + br[0] = ar[0]; + bl[1] = al[1]; + br[1] = ar[1]; + bl[2] = al[2]; + br[2] = ar[2]; + bl[3] = al[3]; + br[3] = ar[3]; + bl[4] = al[4]; + br[4] = ar[4]; + bl[5] = al[5]; + br[5] = ar[5]; + bl[6] = al[6]; + br[6] = ar[6]; + bl[7] = al[7]; + br[7] = ar[7]; + + butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64, + &bl[8], &br[8], &bl[15], &br[15]); + butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64, + cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]); + butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64, + cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]); + butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64, + cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]); + + bl[16] = vaddq_s32(al[16], al[17]); + br[16] = vaddq_s32(ar[16], ar[17]); + bl[17] = vsubq_s32(al[16], al[17]); + br[17] = vsubq_s32(ar[16], ar[17]); + bl[18] = vsubq_s32(al[19], al[18]); + br[18] = vsubq_s32(ar[19], ar[18]); + bl[19] = vaddq_s32(al[19], al[18]); + br[19] = vaddq_s32(ar[19], ar[18]); + bl[20] = vaddq_s32(al[20], al[21]); + br[20] = vaddq_s32(ar[20], ar[21]); + bl[21] = vsubq_s32(al[20], al[21]); + br[21] = vsubq_s32(ar[20], ar[21]); + bl[22] = vsubq_s32(al[23], al[22]); + br[22] = vsubq_s32(ar[23], ar[22]); + bl[23] = vaddq_s32(al[23], al[22]); + br[23] = vaddq_s32(ar[23], ar[22]); + bl[24] = vaddq_s32(al[24], al[25]); + br[24] = vaddq_s32(ar[24], ar[25]); + bl[25] = vsubq_s32(al[24], al[25]); + br[25] = vsubq_s32(ar[24], ar[25]); + bl[26] = vsubq_s32(al[27], al[26]); + br[26] = vsubq_s32(ar[27], ar[26]); + bl[27] = vaddq_s32(al[27], al[26]); + br[27] = vaddq_s32(ar[27], ar[26]); + bl[28] = vaddq_s32(al[28], al[29]); + br[28] = vaddq_s32(ar[28], ar[29]); + bl[29] = vsubq_s32(al[28], al[29]); + br[29] = vsubq_s32(ar[28], ar[29]); + bl[30] = vsubq_s32(al[31], al[30]); + br[30] = vsubq_s32(ar[31], ar[30]); + bl[31] = vaddq_s32(al[31], al[30]); + br[31] = vaddq_s32(ar[31], ar[30]); + + // Final stage. + left[0] = bl[0]; + right[0] = br[0]; + left[16] = bl[1]; + right[16] = br[1]; + left[8] = bl[2]; + right[8] = br[2]; + left[24] = bl[3]; + right[24] = br[3]; + left[4] = bl[4]; + right[4] = br[4]; + left[20] = bl[5]; + right[20] = br[5]; + left[12] = bl[6]; + right[12] = br[6]; + left[28] = bl[7]; + right[28] = br[7]; + left[2] = bl[8]; + right[2] = br[8]; + left[18] = bl[9]; + right[18] = br[9]; + left[10] = bl[10]; + right[10] = br[10]; + left[26] = bl[11]; + right[26] = br[11]; + left[6] = bl[12]; + right[6] = br[12]; + left[22] = bl[13]; + right[22] = br[13]; + left[14] = bl[14]; + right[14] = br[14]; + left[30] = bl[15]; + right[30] = br[15]; + + butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64, + cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]); + left[1] = al[1]; + right[1] = ar[1]; + left[31] = al[31]; + right[31] = ar[31]; + + butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64, + cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]); + left[17] = al[17]; + right[17] = ar[17]; + left[15] = al[15]; + right[15] = ar[15]; + + butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64, + cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]); + left[9] = al[9]; + right[9] = ar[9]; + left[23] = al[23]; + right[23] = ar[23]; + + butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64, + cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]); + left[25] = al[25]; + right[25] = ar[25]; + left[7] = al[7]; + right[7] = ar[7]; + + butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64, + cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]); + left[5] = al[5]; + right[5] = ar[5]; + left[27] = al[27]; + right[27] = ar[27]; + + butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64, + cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]); + left[21] = al[21]; + right[21] = ar[21]; + left[11] = al[11]; + right[11] = ar[11]; + + butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64, + cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]); + left[13] = al[13]; + right[13] = ar[13]; + left[19] = al[19]; + right[19] = ar[19]; + + butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64, + cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]); + left[29] = al[29]; + right[29] = ar[29]; + left[3] = al[3]; + right[3] = ar[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH + +#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c new file mode 100644 index 0000000000..4bc968ecba --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/fdct4x4_neon.h" + +void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t in[4]; + in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); + in[0] = vadd_s16(in[0], one); + } + vpx_fdct4x4_pass1_neon(in); + vpx_fdct4x4_pass2_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(in[0], in[1]); + int16x8_t out_23 = vcombine_s16(in[2], in[3]); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + const int32x4_t const_one = vdupq_n_s32(1); + + // input[M * stride] * 16 + int32x4_t in[4]; + in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4); + in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4); + in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4); + in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4); + + // If the very first value != 0, then add 1. + if (input[0] != 0) { + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); + } + + vpx_highbd_fdct4x4_pass1_neon(in); + vpx_highbd_fdct4x4_pass1_neon(in); + { + // Not quite a rounding shift. Only add 1 despite shifting by 2. + in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2); + in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2); + in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2); + in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2); + + vst1q_s32(final_output, in[0]); + vst1q_s32(final_output + 4, in[1]); + vst1q_s32(final_output + 8, in[2]); + vst1q_s32(final_output + 12, in[3]); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h new file mode 100644 index 0000000000..de3db9774c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct4x4_neon.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ + +#include + +static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) { + int16x4_t out[4]; + + const int16x8_t input_01 = vcombine_s16(in[0], in[1]); + const int16x8_t input_32 = vcombine_s16(in[3], in[2]); + + // in_0 +/- in_3, in_1 +/- in_2 + const int16x8_t s_01 = vaddq_s16(input_01, input_32); + const int16x8_t s_32 = vsubq_s16(input_01, input_32); + + // step_0 +/- step_1, step_2 +/- step_3 + const int16x4_t s_0 = vget_low_s16(s_01); + const int16x4_t s_1 = vget_high_s16(s_01); + const int16x4_t s_2 = vget_high_s16(s_32); + const int16x4_t s_3 = vget_low_s16(s_32); + + // fdct_round_shift(s_0 +/- s_1) * cospi_16_64 + butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0], + &out[2]); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]); + + transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) { + int32x4_t out[4]; + // in_0 +/- in_3, in_1 +/- in_2 + const int32x4_t s_0 = vaddq_s32(in[0], in[3]); + const int32x4_t s_1 = vaddq_s32(in[1], in[2]); + const int32x4_t s_2 = vsubq_s32(in[1], in[2]); + const int32x4_t s_3 = vsubq_s32(in[0], in[3]); + + butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]); + + // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64 + // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64 + butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64, + &out[1], &out[3]); + + transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]); + + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c new file mode 100644 index 0000000000..75ee6f2230 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.c @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/fdct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/fdct8x8_neon.h" + +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // stage 1 + int16x8_t in[8]; + in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + + vpx_fdct8x8_pass1_neon(in); + vpx_fdct8x8_pass2_neon(in); + { + // from vpx_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15); + const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15); + const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15); + const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15); + const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15); + const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15); + const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15); + const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15); + in[0] = vhsubq_s16(in[0], sign_in0); + in[1] = vhsubq_s16(in[1], sign_in1); + in[2] = vhsubq_s16(in[2], sign_in2); + in[3] = vhsubq_s16(in[3], sign_in3); + in[4] = vhsubq_s16(in[4], sign_in4); + in[5] = vhsubq_s16(in[5], sign_in5); + in[6] = vhsubq_s16(in[6], sign_in6); + in[7] = vhsubq_s16(in[7], sign_in7); + // store results + store_s16q_to_tran_low(final_output + 0 * 8, in[0]); + store_s16q_to_tran_low(final_output + 1 * 8, in[1]); + store_s16q_to_tran_low(final_output + 2 * 8, in[2]); + store_s16q_to_tran_low(final_output + 3 * 8, in[3]); + store_s16q_to_tran_low(final_output + 4 * 8, in[4]); + store_s16q_to_tran_low(final_output + 5 * 8, in[5]); + store_s16q_to_tran_low(final_output + 6 * 8, in[6]); + store_s16q_to_tran_low(final_output + 7 * 8, in[7]); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int32x4_t left[8], right[8]; + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + + left[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + left[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + left[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + left[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + left[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + left[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + left[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + left[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + right[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + right[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + right[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + right[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + right[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + right[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + right[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + right[7] = vshll_n_s16(vget_high_s16(in[7]), 2); + + vpx_highbd_fdct8x8_pass1_neon(left, right); + vpx_highbd_fdct8x8_pass2_neon(left, right); + { + left[0] = add_round_shift_half_s32(left[0]); + left[1] = add_round_shift_half_s32(left[1]); + left[2] = add_round_shift_half_s32(left[2]); + left[3] = add_round_shift_half_s32(left[3]); + left[4] = add_round_shift_half_s32(left[4]); + left[5] = add_round_shift_half_s32(left[5]); + left[6] = add_round_shift_half_s32(left[6]); + left[7] = add_round_shift_half_s32(left[7]); + right[0] = add_round_shift_half_s32(right[0]); + right[1] = add_round_shift_half_s32(right[1]); + right[2] = add_round_shift_half_s32(right[2]); + right[3] = add_round_shift_half_s32(right[3]); + right[4] = add_round_shift_half_s32(right[4]); + right[5] = add_round_shift_half_s32(right[5]); + right[6] = add_round_shift_half_s32(right[6]); + right[7] = add_round_shift_half_s32(right[7]); + + // store results + vst1q_s32(final_output, left[0]); + vst1q_s32(final_output + 4, right[0]); + vst1q_s32(final_output + 8, left[1]); + vst1q_s32(final_output + 12, right[1]); + vst1q_s32(final_output + 16, left[2]); + vst1q_s32(final_output + 20, right[2]); + vst1q_s32(final_output + 24, left[3]); + vst1q_s32(final_output + 28, right[3]); + vst1q_s32(final_output + 32, left[4]); + vst1q_s32(final_output + 36, right[4]); + vst1q_s32(final_output + 40, left[5]); + vst1q_s32(final_output + 44, right[5]); + vst1q_s32(final_output + 48, left[6]); + vst1q_s32(final_output + 52, right[6]); + vst1q_s32(final_output + 56, left[7]); + vst1q_s32(final_output + 60, right[7]); + } +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h new file mode 100644 index 0000000000..cc65157430 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct8x8_neon.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ + +#include + +static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in, + int16x8_t *out) { + int16x8_t s[8], x[4], t[2]; + + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1], + &t[0]); + + // Stage 3 + x[0] = vaddq_s16(s[4], t[0]); + x[1] = vsubq_s16(s[4], t[0]); + x[2] = vsubq_s16(s[7], t[1]); + x[3] = vaddq_s16(s[7], t[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]); +} + +static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass1_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) { + int16x8_t out[8]; + vpx_fdct8x8_pass2_notranspose_neon(in, out); + // transpose 8x8 + transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + in[0] = out[0]; + in[1] = out[1]; + in[2] = out[2]; + in[3] = out[3]; + in[4] = out[4]; + in[5] = out[5]; + in[6] = out[6]; + in[7] = out[7]; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64, + &left[2], &right[2], &left[6], &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64, + &left[1], &right[1], &left[7], &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64, + &left[5], &right[5], &left[3], &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, + int32x4_t *right) { + int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4]; + + sl[0] = vaddq_s32(left[0], left[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sr[1] = vaddq_s32(right[1], right[6]); + sr[2] = vaddq_s32(right[2], right[5]); + sr[3] = vaddq_s32(right[3], right[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sr[5] = vsubq_s32(right[2], right[5]); + sr[6] = vsubq_s32(right[1], right[6]); + sr[7] = vsubq_s32(right[0], right[7]); + + // fdct4(step, step); + // x0 = s0 + s3; + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + // x1 = s1 + s2; + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + // x2 = s1 - s2; + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + // x3 = s0 - s3; + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // fdct4(step, step); + // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64) + // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[4], &right[4]); + // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64) + // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[2], &right[2], &left[6], + &right[6]); + + // Stage 2 + // t0 = (s6 - s5) * cospi_16_64; + // t1 = (s6 + s5) * cospi_16_64; + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1], + &tr[1], &tl[0], &tr[0]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], tl[0]); + xr[0] = vaddq_s32(sr[4], tr[0]); + xl[1] = vsubq_s32(sl[4], tl[0]); + xr[1] = vsubq_s32(sr[4], tr[0]); + xl[2] = vsubq_s32(sl[7], tl[1]); + xr[2] = vsubq_s32(sr[7], tr[1]); + xl[3] = vaddq_s32(sl[7], tl[1]); + xr[3] = vaddq_s32(sr[7], tr[1]); + + // Stage 4 + // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[1], &right[1], &left[7], + &right[7]); + + // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[5], &right[5], &left[3], + &right[3]); +} + +static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, + int32x4_t *right) { + vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); + transpose_s32_8x8_2(left, right, left, right); +} + +static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, + int32x4_t *right) { + vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); + transpose_s32_8x8_2(left, right, left, right); +} + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h new file mode 100644 index 0000000000..16f5c5fc0e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_neon.h @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_FDCT_NEON_H_ + +#include + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on half vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant, + int16x4_t *add, + int16x4_t *sub) { + int16x4_t c = vdup_n_s16(2 * constant); + *add = vqrdmulh_s16(vadd_s16(a, b), c); + *sub = vqrdmulh_s16(vsub_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulh_s16 operation on full vector +// can be slightly less accurate, adequate for pass1 +static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a, + const int16x8_t b, + const tran_coef_t constant, + int16x8_t *add, + int16x8_t *sub) { + int16x8_t c = vdupq_n_s16(2 * constant); + *add = vqrdmulhq_s16(vaddq_s16(a, b), c); + *sub = vqrdmulhq_s16(vsubq_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + int32x4_t c = vdupq_n_s32(constant << 17); + const int16x4_t a_lo = vget_low_s16(a); + const int16x4_t a_hi = vget_high_s16(a); + const int16x4_t b_lo = vget_low_s16(b); + const int16x4_t b_hi = vget_high_s16(b); + *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add_lo, add_hi, sub_lo, sub_hi; + butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo, + &sub_hi); + *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi)); + *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns full 32-bit values, high/low +static INLINE void butterfly_one_coeff_s16_s32_fast_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int32x4_t *add, int32x4_t *sub) { + int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddl_s16(a, b), c); + *sub = vqrdmulhq_s32(vsubl_s16(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on half vector +// more accurate does 32-bit processing, takes 16-bit input values, +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half( + const int16x4_t a, const int16x4_t b, const tran_coef_t constant, + int16x4_t *add, int16x4_t *sub) { + int32x4_t add32, sub32; + butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32); + *add = vmovn_s32(add32); + *sub = vmovn_s32(sub32); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// fdct_round_shift((a +/- b) * c) +// Original Variant that performs normal implementation on full vector +// fully accurate does 32-bit processing, takes 16-bit values +// returns narrowed down 16-bit values +static INLINE void butterfly_one_coeff_s16_s32_narrow( + const int16x8_t a, const int16x8_t b, const tran_coef_t constant, + int16x8_t *add, int16x8_t *sub) { + int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi; + butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo, + &sub32_hi); + *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi)); + *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi)); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant); + *add_lo = vmlaq_n_s32(a1, b_lo, constant); + *add_hi = vmlaq_n_s32(a2, b_hi, constant); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a, + const int32x4_t b, + const tran_coef_t constant, + int32x4_t *add, + int32x4_t *sub) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add = vqrdmulhq_s32(vaddq_s32(a, b), c); + *sub = vqrdmulhq_s32(vsubq_s32(a, b), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs fast vqrdmulhq_s32 operation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values, +// high/low +static INLINE void butterfly_one_coeff_s32_fast( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t c = vdupq_n_s32(constant << 17); + *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c); + *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c); + *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c); + *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); +} + +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow_half( + const int32x4_t a, const int32x4_t b, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) { + const int32x2_t a_lo = vget_low_s32(a); + const int32x2_t a_hi = vget_high_s32(a); + const int32x2_t b_lo = vget_low_s32(b); + const int32x2_t b_hi = vget_high_s32(b); + + const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1); + const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1); + const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2); + const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2); + + const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2); + const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2); + const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1); + const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1); + + *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS), + vrshrn_n_s64(sum_hi, DCT_CONST_BITS)); + *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS), + vrshrn_n_s64(diff_hi, DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s16_s32_noround( + const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo, + const int16x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmull_n_s16(a_lo, constant1); + const int32x4_t a2 = vmull_n_s16(a_hi, constant1); + const int32x4_t a3 = vmull_n_s16(a_lo, constant2); + const int32x4_t a4 = vmull_n_s16(a_hi, constant2); + *add_lo = vmlal_n_s16(a1, b_lo, constant2); + *add_hi = vmlal_n_s16(a2, b_hi, constant2); + *sub_lo = vmlsl_n_s16(a3, b_lo, constant1); + *sub_hi = vmlsl_n_s16(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + *add_lo = vmlaq_n_s32(a1, b_lo, constant2); + *add_hi = vmlaq_n_s32(a2, b_hi, constant2); + *sub_lo = vmlsq_n_s32(a3, b_lo, constant1); + *sub_hi = vmlsq_n_s32(a4, b_hi, constant1); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Variant that performs normal implementation on half vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_half(const int16x4_t a, + const int16x4_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x4_t *add, int16x4_t *sub) { + const int32x4_t a1 = vmull_n_s16(a, constant1); + const int32x4_t a2 = vmull_n_s16(a, constant2); + const int32x4_t sum = vmlal_n_s16(a1, b, constant2); + const int32x4_t diff = vmlsl_n_s16(a2, b, constant1); + *add = vqrshrn_n_s32(sum, DCT_CONST_BITS); + *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 16-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant1, + const tran_coef_t constant2, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2); + const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2); + const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2); + const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2); + const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c1 +/- b * c2) +// Original Variant that performs normal implementation on full vector +// more accurate does 32-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi, + int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a1 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a2 = vmulq_n_s32(a_hi, constant1); + const int32x4_t a3 = vmulq_n_s32(a_lo, constant2); + const int32x4_t a4 = vmulq_n_s32(a_hi, constant2); + const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2); + const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2); + const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift and round, +// return narrowed results +static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +// Add 1 if negative, and shift by 1. +// In practice, add the sign bit, then shift and round +static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_s32(a); + const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31); + const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32); + return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); +} + +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c new file mode 100644 index 0000000000..ee9e599e09 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x4_t a0, a1, a2, a3; + int16x8_t b0, b1; + int16x8_t c; + + a0 = vld1_s16(input); + input += stride; + a1 = vld1_s16(input); + input += stride; + a2 = vld1_s16(input); + input += stride; + a3 = vld1_s16(input); + + b0 = vcombine_s16(a0, a1); + b1 = vcombine_s16(a2, a3); + + c = vaddq_s16(b0, b1); + + output[0] = (tran_low_t)(horizontal_add_int16x8(c) << 1); + output[1] = 0; +} + +// Visual Studio 2022 (cl.exe) < 17.7 targeting AArch64 with optimizations +// enabled will fail with an internal compiler error. See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +#if defined(_MSC_VER) && _MSC_VER < 1937 && defined(_M_ARM64) && \ + !defined(__clang__) +#define AOM_WORK_AROUND_MSVC_BUG_10346110 +#endif + +#ifdef AOM_WORK_AROUND_MSVC_BUG_10346110 +#pragma optimize("", off) +#endif +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + + output[0] = (tran_low_t)horizontal_add_int16x8(sum); + output[1] = 0; +} +#ifdef AOM_WORK_AROUND_MSVC_BUG_10346110 +#pragma optimize("", on) +#endif +#undef AOM_WORK_AROUND_MSVC_BUG_10346110 + +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t left = vld1q_s16(input); + int16x8_t right = vld1q_s16(input + 8); + int32_t sum; + input += stride; + + for (r = 1; r < 16; ++r) { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + left = vaddq_s16(left, a); + right = vaddq_s16(right, b); + } + + sum = horizontal_add_int16x8(left) + horizontal_add_int16x8(right); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t a0 = vld1q_s16(input); + int16x8_t a1 = vld1q_s16(input + 8); + int16x8_t a2 = vld1q_s16(input + 16); + int16x8_t a3 = vld1q_s16(input + 24); + int32_t sum; + input += stride; + + for (r = 1; r < 32; ++r) { + const int16x8_t b0 = vld1q_s16(input); + const int16x8_t b1 = vld1q_s16(input + 8); + const int16x8_t b2 = vld1q_s16(input + 16); + const int16x8_t b3 = vld1q_s16(input + 24); + input += stride; + a0 = vaddq_s16(a0, b0); + a1 = vaddq_s16(a1, b1); + a2 = vaddq_s16(a2, b2); + a3 = vaddq_s16(a3, b3); + } + + sum = horizontal_add_int16x8(a0); + sum += horizontal_add_int16x8(a1); + sum += horizontal_add_int16x8(a2); + sum += horizontal_add_int16x8(a3); + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#if CONFIG_VP9_HIGHBITDEPTH + +void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int32_t sum; + + int r = 0; + do { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b)); + r++; + } while (r < 16); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 1); + output[1] = 0; +} + +void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + + int32_t sum; + + int r = 0; + do { + const int16x8_t a0 = vld1q_s16(input); + const int16x8_t a1 = vld1q_s16(input + 8); + const int16x8_t a2 = vld1q_s16(input + 16); + const int16x8_t a3 = vld1q_s16(input + 24); + input += stride; + partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0)); + partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1)); + partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2)); + partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3)); + partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3)); + r++; + } while (r < 32); + + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]); + partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]); + partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]); + sum = horizontal_add_int32x4(partial_sum[0]); + + output[0] = (tran_low_t)(sum >> 3); + output[1] = 0; +} + +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c deleted file mode 100644 index e9503f13d7..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vpx_config.h" -#include "vpx_dsp/txfm_common.h" - -void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { - int i; - // stage 1 - int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); - int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); - int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); - int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); - int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); - int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); - int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); - int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); - for (i = 0; i < 2; ++i) { - int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; - const int16x8_t v_s0 = vaddq_s16(input_0, input_7); - const int16x8_t v_s1 = vaddq_s16(input_1, input_6); - const int16x8_t v_s2 = vaddq_s16(input_2, input_5); - const int16x8_t v_s3 = vaddq_s16(input_3, input_4); - const int16x8_t v_s4 = vsubq_s16(input_3, input_4); - const int16x8_t v_s5 = vsubq_s16(input_2, input_5); - const int16x8_t v_s6 = vsubq_s16(input_1, input_6); - const int16x8_t v_s7 = vsubq_s16(input_0, input_7); - // fdct4(step, step); - int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); - int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); - int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); - int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); - // fdct4(step, step); - int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); - int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 - out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 - out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 - out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 - } - // Stage 2 - v_x0 = vsubq_s16(v_s6, v_s5); - v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x8_t ab = vcombine_s16(a, b); - const int16x8_t cd = vcombine_s16(c, d); - // Stage 3 - v_x0 = vaddq_s16(v_s4, ab); - v_x1 = vsubq_s16(v_s4, ab); - v_x2 = vsubq_s16(v_s7, cd); - v_x3 = vaddq_s16(v_s7, cd); - } - // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); - { - const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); - const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); - const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); - const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); - const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); - const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); - const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); - const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); - out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 - out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 - out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 - out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 - } - // transpose 8x8 - { - // 00 01 02 03 40 41 42 43 - // 10 11 12 13 50 51 52 53 - // 20 21 22 23 60 61 62 63 - // 30 31 32 33 70 71 72 73 - // 04 05 06 07 44 45 46 47 - // 14 15 16 17 54 55 56 57 - // 24 25 26 27 64 65 66 67 - // 34 35 36 37 74 75 76 77 - const int32x4x2_t r02_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); - const int32x4x2_t r13_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); - const int32x4x2_t r46_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); - const int32x4x2_t r57_s32 = - vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); - const int16x8x2_t r01_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), - vreinterpretq_s16_s32(r13_s32.val[0])); - const int16x8x2_t r23_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), - vreinterpretq_s16_s32(r13_s32.val[1])); - const int16x8x2_t r45_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), - vreinterpretq_s16_s32(r57_s32.val[0])); - const int16x8x2_t r67_s16 = - vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), - vreinterpretq_s16_s32(r57_s32.val[1])); - input_0 = r01_s16.val[0]; - input_1 = r01_s16.val[1]; - input_2 = r23_s16.val[0]; - input_3 = r23_s16.val[1]; - input_4 = r45_s16.val[0]; - input_5 = r45_s16.val[1]; - input_6 = r67_s16.val[0]; - input_7 = r67_s16.val[1]; - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } // for - { - // from vpx_dct_sse2.c - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); - const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); - const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); - const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); - const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); - const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); - const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); - const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); - input_0 = vhsubq_s16(input_0, sign_in0); - input_1 = vhsubq_s16(input_1, sign_in1); - input_2 = vhsubq_s16(input_2, sign_in2); - input_3 = vhsubq_s16(input_3, sign_in3); - input_4 = vhsubq_s16(input_4, sign_in4); - input_5 = vhsubq_s16(input_5, sign_in5); - input_6 = vhsubq_s16(input_6, sign_in6); - input_7 = vhsubq_s16(input_7, sign_in7); - // store results - vst1q_s16(&final_output[0 * 8], input_0); - vst1q_s16(&final_output[1 * 8], input_1); - vst1q_s16(&final_output[2 * 8], input_2); - vst1q_s16(&final_output[3 * 8], input_3); - vst1q_s16(&final_output[4 * 8], input_4); - vst1q_s16(&final_output[5 * 8], input_5); - vst1q_s16(&final_output[6 * 8], input_6); - vst1q_s16(&final_output[7 * 8], input_7); - } -} - -void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); - output[1] = 0; - } -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c index 977323497a..f5a044be4d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -11,6 +11,9 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, @@ -44,8 +47,8 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, *a7 = vaddq_s16(c1, c5); } -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); @@ -63,18 +66,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, // Skip the second transpose because it is not required. - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); } -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int i; /* Rearrange 16x16 to 8x32 and remove stride. @@ -88,10 +91,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); @@ -103,10 +106,52 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); + + coeff += 8; + } +} + +void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + + /* Rearrange 32x32 to 16x64 and remove stride. + * Top left first. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride, + coeff + 256); + /* Bottom left. */ + vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride, + coeff + 512); + /* Bottom right. */ + vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride, + coeff + 768); + + for (i = 0; i < 256; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); + + const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1); + const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1); + const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1); + const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 256, c1); + store_s16q_to_tran_low(coeff + 512, c2); + store_s16q_to_tran_low(coeff + 768, c3); coeff += 8; } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c new file mode 100644 index 0000000000..4265596c8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + +// coeff: 32 bits, dynamic range [-2147483648, 2147483647]. +// length: value range {16, 64, 256, 1024}. +// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} + +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 0000000000..3063acbb3e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h new file mode 100644 index 0000000000..ccc4a797b7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_neon.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ + +#include + +static INLINE uint16x4_t highbd_convolve4_4_neon( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t filters, const uint16x4_t max) { + int32x4_t sum = vmull_lane_s16(s0, filters, 0); + sum = vmlal_lane_s16(sum, s1, filters, 1); + sum = vmlal_lane_s16(sum, s2, filters, 2); + sum = vmlal_lane_s16(sum, s3, filters, 3); + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_neon( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x4_t filters, const uint16x8_t max) { + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters, 3); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters, 3); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); +} + +#endif // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h new file mode 100644 index 0000000000..bc90d9b4dd --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_convolve8_sve.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_ + +#include + +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +static INLINE uint16x4_t highbd_convolve4_4_sve(const int16x4_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s01, filter, 0); + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s23, filter, 0); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_sve(const int16x8_t s[4], + const int16x8_t filter, + const uint16x8_t max, + uint16x8_t idx) { + int64x2_t sum04 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[0], filter, 0); + int64x2_t sum15 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[1], filter, 0); + int64x2_t sum26 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[2], filter, 0); + int64x2_t sum37 = vpx_dotq_lane_s16(vdupq_n_s64(0), s[3], filter, 0); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + res = vpx_tbl_u16(res, idx); + + return vminq_u16(res, max); +} + +static INLINE uint16x4_t highbd_convolve8_4(const int16x8_t s[4], + const int16x8_t filter, + const uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + + uint16x4_t res_u16 = vqrshrun_n_s32(res_s32, FILTER_BITS); + return vmin_u16(res_u16, max); +} + +static INLINE uint16x8_t highbd_convolve8_8(const int16x8_t s[8], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = vpx_dotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = vpx_dotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = vpx_dotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = vpx_dotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = vpx_dotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = vpx_dotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = vpx_dotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = vpx_dotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + return vminq_u16(res, max); +} + +#endif // VPX_VPX_DSP_ARM_HIGHBD_CONVOLVE8_SVE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 0000000000..7be88f6bcb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. + + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c new file mode 100644 index 0000000000..654ab42ca4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -0,0 +1,1361 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); +} + +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; +} + +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); +} + +static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26N_14_18N), 1); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26N_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26N_14_18N), 1); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_q_kernel( + const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_d_kernel( + const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = + vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = + vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]); + t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); + t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); + t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); + t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[6]; + + t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + dct_const_round_shift_high_4x2x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[3]; + + t[2].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + dct_const_round_shift_high_4_dual(t, d0, d1); +} + +static INLINE void highbd_idct16x16_add_stage7_dual( + const int32x4x2_t *const step2, int32x4x2_t *const out) { + out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]); + out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]); + out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]); + out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]); + out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]); + out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]); + out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]); + out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]); + out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]); + out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]); + out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]); + out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]); + out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]); + out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]); + out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]); + out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]); + out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]); + out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]); + out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]); + out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]); + out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]); + out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]); + out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]); + out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]); + out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]); + out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]); + out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]); + out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]); + out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]); + out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]); + out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]); + out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, + int32x4_t *const out) { + out[0] = vaddq_s32(step2[0], step2[15]); + out[1] = vaddq_s32(step2[1], step2[14]); + out[2] = vaddq_s32(step2[2], step2[13]); + out[3] = vaddq_s32(step2[3], step2[12]); + out[4] = vaddq_s32(step2[4], step2[11]); + out[5] = vaddq_s32(step2[5], step2[10]); + out[6] = vaddq_s32(step2[6], step2[9]); + out[7] = vaddq_s32(step2[7], step2[8]); + out[8] = vsubq_s32(step2[7], step2[8]); + out[9] = vsubq_s32(step2[6], step2[9]); + out[10] = vsubq_s32(step2[5], step2[10]); + out[11] = vsubq_s32(step2[4], step2[11]); + out[12] = vsubq_s32(step2[3], step2[12]); + out[13] = vsubq_s32(step2[2], step2[13]); + out[14] = vsubq_s32(step2[1], step2[14]); + out[15] = vsubq_s32(step2[0], step2[15]); +} + +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], + &step2[15]); + highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], + &step1[7]); + highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], + &step1[6]); + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]); + step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]); + step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]); + step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]); + step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]); + step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]); + step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]); + step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]); + step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]); + step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]); + step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]); + + // stage 4 + highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], + &step2[0]); + highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], + &step2[3]); + step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]); + step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]); + step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]); + step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]); + step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]); + step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]); + step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]); + step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]); + step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]); + step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]); + step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]); + step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]); + step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]); + step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]); + step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]); + step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]); + step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]); + step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]); + step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]); + step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]); + step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]); + step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]); + + // stage 6 + step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]); + step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]); + step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]); + step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]); + step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]); + step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]); + step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]); + step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]); + step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]); + step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]); + step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]); + step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]); + step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); + return dct_const_round_shift_high_4x2_int64x2x2(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); + return dct_const_round_shift_high_4(t); +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); + return dct_const_round_shift_high_4x2_int64x2x2(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); + return dct_const_round_shift_high_4(t); +} + +static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 16; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 16; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 16; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 16; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 16; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 16; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 16; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[9] = highbd_idct_cospi_lane1_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[10] = + highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[13] = + highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[14] = highbd_idct_cospi_lane0_dual(step1[14], + vget_high_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[5] = + highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28)); + step1[6] = + highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = highbd_idct_add_dual(step2[8], step2[9]); + step1[9] = highbd_idct_sub_dual(step2[8], step2[9]); + step1[10] = highbd_idct_sub_dual(step2[11], step2[10]); + step1[11] = highbd_idct_add_dual(step2[11], step2[10]); + step1[12] = highbd_idct_add_dual(step2[12], step2[13]); + step1[13] = highbd_idct_sub_dual(step2[12], step2[13]); + step1[14] = highbd_idct_sub_dual(step2[15], step2[14]); + step1[15] = highbd_idct_add_dual(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[2] = + highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24)); + step2[3] = + highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24)); + step2[4] = highbd_idct_add_dual(step1[4], step1[5]); + step2[5] = highbd_idct_sub_dual(step1[4], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[7], step1[6]); + step2[7] = highbd_idct_add_dual(step1[7], step1[6]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = highbd_idct_add_dual(step2[0], step2[3]); + step1[1] = highbd_idct_add_dual(step2[1], step2[2]); + step1[2] = highbd_idct_sub_dual(step2[1], step2[2]); + step1[3] = highbd_idct_sub_dual(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = vld1q_s32(input); + input += 16; + in[1] = vld1q_s32(input); + input += 16; + in[2] = vld1q_s32(input); + input += 16; + in[3] = vld1q_s32(input); + + // Transpose + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s32(step2[8], step2[11]); + step1[9] = vaddq_s32(step2[9], step2[10]); + step1[10] = vsubq_s32(step2[9], step2[10]); + step1[11] = vsubq_s32(step2[8], step2[11]); + step1[12] = vsubq_s32(step2[15], step2[12]); + step1[13] = vsubq_s32(step2[14], step2[13]); + step1[14] = vaddq_s32(step2[14], step2[13]); + step1[15] = vaddq_s32(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s32(step1[0], step1[7]); + step2[1] = vaddq_s32(step1[1], step1[6]); + step2[2] = vaddq_s32(step1[2], step1[5]); + step2[3] = vaddq_s32(step1[3], step1[4]); + step2[4] = vsubq_s32(step1[3], step1[4]); + step2[5] = vsubq_s32(step1[2], step1[5]); + step2[6] = vsubq_s32(step1[1], step1[6]); + step2[7] = vsubq_s32(step1[0], step1[7]); + highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7(step2, out); + + // pass 1: save the result into output + vst1q_s32(output, out[0]); + output += 4; + vst1q_s32(output, out[1]); + output += 4; + vst1q_s32(output, out[2]); + output += 4; + vst1q_s32(output, out[3]); + output += 4; + vst1q_s32(output, out[4]); + output += 4; + vst1q_s32(output, out[5]); + output += 4; + vst1q_s32(output, out[6]); + output += 4; + vst1q_s32(output, out[7]); + output += 4; + vst1q_s32(output, out[8]); + output += 4; + vst1q_s32(output, out[9]); + output += 4; + vst1q_s32(output, out[10]); + output += 4; + vst1q_s32(output, out[11]); + output += 4; + vst1q_s32(output, out[12]); + output += 4; + vst1q_s32(output, out[13]); + output += 4; + vst1q_s32(output, out[14]); + output += 4; + vst1q_s32(output, out[15]); +} + +static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, + const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + in[0].val[0] = vld1q_s32(input); + input += 4; + in[0].val[1] = vld1q_s32(input); + input += 4; + in[1].val[0] = vld1q_s32(input); + input += 4; + in[1].val[1] = vld1q_s32(input); + input += 4; + in[2].val[0] = vld1q_s32(input); + input += 4; + in[2].val[1] = vld1q_s32(input); + input += 4; + in[3].val[0] = vld1q_s32(input); + input += 4; + in[3].val[1] = vld1q_s32(input); + + // Transpose + transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1], + &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, + bd); + + // Parallel idct on the lower 8 rows + vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, + dest, stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, + dest + 8, stride, bd); + } +} + +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, + bd); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); + } +} + +static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + *dest += stride; +} + +static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + vst1q_u16(*dest + 0, c0); + vst1q_u16(*dest + 8, c1); + *dest += stride; +} + +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c new file mode 100644 index 0000000000..5b36f73367 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -0,0 +1,640 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_from_transformed(const int32_t *const trans_buf, + const int first, const int second, + int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(trans_buf + first * 8); + q0->val[1] = vld1q_s32(trans_buf + first * 8 + 4); + q1->val[0] = vld1q_s32(trans_buf + second * 8); + q1->val[1] = vld1q_s32(trans_buf + second * 8 + 4); +} + +static INLINE void load_from_output(const int32_t *const out, const int first, + const int second, int32x4x2_t *const q0, + int32x4x2_t *const q1) { + q0->val[0] = vld1q_s32(out + first * 32); + q0->val[1] = vld1q_s32(out + first * 32 + 4); + q1->val[0] = vld1q_s32(out + second * 32); + q1->val[1] = vld1q_s32(out + second * 32 + 4); +} + +static INLINE void store_in_output(int32_t *const out, const int first, + const int second, const int32x4x2_t q0, + const int32x4x2_t q1) { + vst1q_s32(out + first * 32, q0.val[0]); + vst1q_s32(out + first * 32 + 4, q0.val[1]); + vst1q_s32(out + second * 32, q1.val[0]); + vst1q_s32(out + second * 32 + 4, q1.val[1]); +} + +static INLINE void highbd_store_combine_results( + uint16_t *p1, uint16_t *p2, const int stride, const int32x4x2_t q0, + const int32x4x2_t q1, const int32x4x2_t q2, const int32x4x2_t q3, + const int16x8_t max) { + int16x8_t o[4]; + uint16x8_t d[4]; + + d[0] = vld1q_u16(p1); + p1 += stride; + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); + p2 -= stride; + d[2] = vld1q_u16(p2); + + o[0] = vcombine_s16(vrshrn_n_s32(q0.val[0], 6), vrshrn_n_s32(q0.val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(q1.val[0], 6), vrshrn_n_s32(q1.val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(q2.val[0], 6), vrshrn_n_s32(q2.val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(q3.val[0], 6), vrshrn_n_s32(q3.val[1], 6)); + + o[0] = vqaddq_s16(o[0], vreinterpretq_s16_u16(d[0])); + o[1] = vqaddq_s16(o[1], vreinterpretq_s16_u16(d[1])); + o[2] = vqaddq_s16(o[2], vreinterpretq_s16_u16(d[2])); + o[3] = vqaddq_s16(o[3], vreinterpretq_s16_u16(d[3])); + o[0] = vminq_s16(o[0], max); + o[1] = vminq_s16(o[1], max); + o[2] = vminq_s16(o[2], max); + o[3] = vminq_s16(o[3], max); + d[0] = vqshluq_n_s16(o[0], 0); + d[1] = vqshluq_n_s16(o[1], 0); + d[2] = vqshluq_n_s16(o[2], 0); + d[3] = vqshluq_n_s16(o[3], 0); + + vst1q_u16(p1, d[1]); + p1 -= stride; + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); + p2 += stride; + vst1q_u16(p2, d[3]); +} + +static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, + const int32_t first_const, + const int32_t second_const, + int32x4x2_t *const qOut0, + int32x4x2_t *const qOut1) { + int64x2x2_t q[4]; + int32x2_t d[6]; + + // Note: using v{mul, mla, mls}l_n_s32 here slows down 35% with gcc 4.9. + d[4] = vdup_n_s32(first_const); + d[5] = vdup_n_s32(second_const); + + q[0].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[4]); + q[0].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[4]); + q[1].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[4]); + q[1].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[4]); + q[0].val[0] = vmlsl_s32(q[0].val[0], vget_low_s32(qIn1.val[0]), d[5]); + q[0].val[1] = vmlsl_s32(q[0].val[1], vget_high_s32(qIn1.val[0]), d[5]); + q[1].val[0] = vmlsl_s32(q[1].val[0], vget_low_s32(qIn1.val[1]), d[5]); + q[1].val[1] = vmlsl_s32(q[1].val[1], vget_high_s32(qIn1.val[1]), d[5]); + + q[2].val[0] = vmull_s32(vget_low_s32(qIn0.val[0]), d[5]); + q[2].val[1] = vmull_s32(vget_high_s32(qIn0.val[0]), d[5]); + q[3].val[0] = vmull_s32(vget_low_s32(qIn0.val[1]), d[5]); + q[3].val[1] = vmull_s32(vget_high_s32(qIn0.val[1]), d[5]); + q[2].val[0] = vmlal_s32(q[2].val[0], vget_low_s32(qIn1.val[0]), d[4]); + q[2].val[1] = vmlal_s32(q[2].val[1], vget_high_s32(qIn1.val[0]), d[4]); + q[3].val[0] = vmlal_s32(q[3].val[0], vget_low_s32(qIn1.val[1]), d[4]); + q[3].val[1] = vmlal_s32(q[3].val[1], vget_high_s32(qIn1.val[1]), d[4]); + + qOut0->val[0] = vcombine_s32(vrshrn_n_s64(q[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[0].val[1], DCT_CONST_BITS)); + qOut0->val[1] = vcombine_s32(vrshrn_n_s64(q[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[1].val[1], DCT_CONST_BITS)); + qOut1->val[0] = vcombine_s32(vrshrn_n_s64(q[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[2].val[1], DCT_CONST_BITS)); + qOut1->val[1] = vcombine_s32(vrshrn_n_s64(q[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); +} + +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); + in += 32; + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); + in += 32; + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); + in += 32; + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); + in += 32; + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); + in += 32; + s[5].val[0] = vld1q_s32(in); + s[5].val[1] = vld1q_s32(in + 4); + in += 32; + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); + in += 32; + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); +} + +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, + int32_t **out) { + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + + vst1q_s32(*out, a[0].val[0]); + *out += 4; + vst1q_s32(*out, a[0].val[1]); + *out += 4; + vst1q_s32(*out, a[1].val[0]); + *out += 4; + vst1q_s32(*out, a[1].val[1]); + *out += 4; + vst1q_s32(*out, a[2].val[0]); + *out += 4; + vst1q_s32(*out, a[2].val[1]); + *out += 4; + vst1q_s32(*out, a[3].val[0]); + *out += 4; + vst1q_s32(*out, a[3].val[1]); + *out += 4; + vst1q_s32(*out, a[4].val[0]); + *out += 4; + vst1q_s32(*out, a[4].val[1]); + *out += 4; + vst1q_s32(*out, a[5].val[0]); + *out += 4; + vst1q_s32(*out, a[5].val[1]); + *out += 4; + vst1q_s32(*out, a[6].val[0]); + *out += 4; + vst1q_s32(*out, a[6].val[1]); + *out += 4; + vst1q_s32(*out, a[7].val[0]); + *out += 4; + vst1q_s32(*out, a[7].val[1]); + *out += 4; +} + +static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { + int i; + int32x4x2_t s[8]; + + for (i = 0; i < 4; i++, input += 8) { + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); + } +} + +static INLINE void idct32_bands_end_1st_pass(int32_t *const out, + int32x4x2_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); +} + +static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out, + uint16_t *const dest, + const int stride, + const int16x8_t max, + int32x4x2_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[10], q[1]); + q[3] = highbd_idct_add_dual(q[11], q[0]); + q[4] = highbd_idct_sub_dual(q[11], q[0]); + q[5] = highbd_idct_sub_dual(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[12], q[1]); + q[3] = highbd_idct_add_dual(q[13], q[0]); + q[4] = highbd_idct_sub_dual(q[13], q[0]); + q[5] = highbd_idct_sub_dual(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[14], q[1]); + q[3] = highbd_idct_add_dual(q[15], q[0]); + q[4] = highbd_idct_sub_dual(q[15], q[0]); + q[5] = highbd_idct_sub_dual(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + highbd_store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9], + max); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = highbd_idct_add_dual(q[2], q[1]); + q[5] = highbd_idct_add_dual(q[3], q[0]); + q[6] = highbd_idct_sub_dual(q[3], q[0]); + q[7] = highbd_idct_sub_dual(q[2], q[1]); + highbd_store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7], + max); +} + +static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input, + uint16_t *dst, const int stride, + const int bd) { + int i, idct32_pass_loop; + int32_t trans_buf[32 * 8]; + int32_t pass1[32 * 32]; + int32_t pass2[32 * 32]; + int32_t *out; + int32x4x2_t q[16]; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, input = pass1, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + input += 32 * 8; + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); + // part of stage 2 + q[4] = highbd_idct_add_dual(q[0], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[6] = highbd_idct_add_dual(q[2], q[3]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); + + // generate 18,19,28,29 + // part of stage 1 + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[3], q[2]); + q[3] = highbd_idct_add_dual(q[3], q[2]); + q[14] = highbd_idct_sub_dual(q[1], q[0]); + q[2] = highbd_idct_add_dual(q[1], q[0]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); + // part of stage 4 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[15] = highbd_idct_add_dual(q[6], q[3]); + q[13] = highbd_idct_sub_dual(q[5], q[0]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[4], q[2]); + q[14] = highbd_idct_sub_dual(q[6], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); + // part of stage 2 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 3 + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + + // generate 22,23,24,25 + // part of stage 1 + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); + // part of stage 2 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 3 + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); + // part of stage 4 + q[10] = highbd_idct_add_dual(q[7], q[1]); + q[11] = highbd_idct_add_dual(q[5], q[0]); + q[12] = highbd_idct_add_dual(q[6], q[2]); + q[15] = highbd_idct_add_dual(q[4], q[3]); + // part of stage 6 + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[11]); + q[9] = highbd_idct_add_dual(q[13], q[10]); + q[13] = highbd_idct_sub_dual(q[13], q[10]); + q[11] = highbd_idct_sub_dual(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = highbd_idct_sub_dual(q[9], q[12]); + q[10] = highbd_idct_add_dual(q[14], q[15]); + q[14] = highbd_idct_sub_dual(q[14], q[15]); + q[12] = highbd_idct_add_dual(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); + // part of stage 4 + q[14] = highbd_idct_sub_dual(q[5], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = highbd_idct_sub_dual(q[7], q[1]); + q[13] = highbd_idct_sub_dual(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); + // part of stage 6 + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = highbd_idct_add_dual(q[14], q[1]); + q[9] = highbd_idct_add_dual(q[13], q[6]); + q[13] = highbd_idct_sub_dual(q[13], q[6]); + q[1] = highbd_idct_sub_dual(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = highbd_idct_sub_dual(q[8], q[5]); + q[10] = highbd_idct_add_dual(q[8], q[5]); + q[11] = highbd_idct_add_dual(q[9], q[0]); + q[0] = highbd_idct_sub_dual(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); + // part of stage 7 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); + // part of stage 3 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 4 + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); + + // generate 10,11,12,13 + // part of stage 2 + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); + // part of stage 3 + q[14] = highbd_idct_sub_dual(q[4], q[5]); + q[5] = highbd_idct_add_dual(q[4], q[5]); + q[13] = highbd_idct_sub_dual(q[6], q[7]); + q[6] = highbd_idct_add_dual(q[6], q[7]); + // part of stage 4 + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); + // part of stage 5 + q[8] = highbd_idct_add_dual(q[0], q[5]); + q[9] = highbd_idct_add_dual(q[1], q[7]); + q[13] = highbd_idct_sub_dual(q[1], q[7]); + q[14] = highbd_idct_sub_dual(q[3], q[4]); + q[10] = highbd_idct_add_dual(q[3], q[4]); + q[15] = highbd_idct_add_dual(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); + // part of stage 6 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = highbd_idct_sub_dual(q[0], q[5]); + q[14] = highbd_idct_sub_dual(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); + // part of stage 4 + q[13] = highbd_idct_sub_dual(q[0], q[1]); + q[0] = highbd_idct_add_dual(q[0], q[1]); + q[14] = highbd_idct_sub_dual(q[2], q[3]); + q[2] = highbd_idct_add_dual(q[2], q[3]); + // part of stage 5 + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + + // generate 0,1,2,3 + // part of stage 4 + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); + // part of stage 5 + q[4] = highbd_idct_add_dual(q[7], q[6]); + q[7] = highbd_idct_sub_dual(q[7], q[6]); + q[6] = highbd_idct_sub_dual(q[5], q[14]); + q[5] = highbd_idct_add_dual(q[5], q[14]); + // part of stage 6 + q[8] = highbd_idct_add_dual(q[4], q[2]); + q[9] = highbd_idct_add_dual(q[5], q[3]); + q[10] = highbd_idct_add_dual(q[6], q[1]); + q[11] = highbd_idct_add_dual(q[7], q[0]); + q[12] = highbd_idct_sub_dual(q[7], q[0]); + q[13] = highbd_idct_sub_dual(q[6], q[1]); + q[14] = highbd_idct_sub_dual(q[5], q[3]); + q[15] = highbd_idct_sub_dual(q[4], q[2]); + // part of stage 7 + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = highbd_idct_add_dual(q[8], q[1]); + q[3] = highbd_idct_add_dual(q[9], q[0]); + q[4] = highbd_idct_sub_dual(q[9], q[0]); + q[5] = highbd_idct_sub_dual(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = highbd_idct_add_dual(q[4], q[1]); + q[9] = highbd_idct_add_dual(q[5], q[0]); + q[6] = highbd_idct_sub_dual(q[5], q[0]); + q[7] = highbd_idct_sub_dual(q[4], q[1]); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q); + } else { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + idct32_bands_end_2nd_pass(out, dst, stride, max, q); + dst += 8; + } + } + } +} + +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + if (bd == 8) { + vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1); + } else { + vpx_highbd_idct32_32_neon(input, dest, stride, bd); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c new file mode 100644 index 0000000000..6750c1a426 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s32_dual( + const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, + int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, + int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { + in0->val[0] = vld1q_s32(input); + in0->val[1] = vld1q_s32(input + 4); + input += 32; + in1->val[0] = vld1q_s32(input); + in1->val[1] = vld1q_s32(input + 4); + input += 32; + in2->val[0] = vld1q_s32(input); + in2->val[1] = vld1q_s32(input + 4); + input += 32; + in3->val[0] = vld1q_s32(input); + in3->val[1] = vld1q_s32(input + 4); + input += 32; + in4->val[0] = vld1q_s32(input); + in4->val[1] = vld1q_s32(input + 4); + input += 32; + in5->val[0] = vld1q_s32(input); + in5->val[1] = vld1q_s32(input + 4); + input += 32; + in6->val[0] = vld1q_s32(input); + in6->val[1] = vld1q_s32(input + 4); + input += 32; + in7->val[0] = vld1q_s32(input); + in7->val[1] = vld1q_s32(input + 4); +} + +static INLINE void load_4x8_s32_dual(const tran_low_t *input, + int32x4_t *const in0, int32x4_t *const in1, + int32x4_t *const in2, int32x4_t *const in3, + int32x4_t *const in4, int32x4_t *const in5, + int32x4_t *const in6, + int32x4_t *const in7) { + *in0 = vld1q_s32(input); + input += 32; + *in1 = vld1q_s32(input); + input += 32; + *in2 = vld1q_s32(input); + input += 32; + *in3 = vld1q_s32(input); + input += 32; + *in4 = vld1q_s32(input); + input += 32; + *in5 = vld1q_s32(input); + input += 32; + *in6 = vld1q_s32(input); + input += 32; + *in7 = vld1q_s32(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, + int32_t *output) { + int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + s8[32]; + + load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], + &in[6], &in[7]); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], + &in[9].val[1], &in[10].val[0], &in[10].val[1], + &in[11].val[0], &in[11].val[1]); + transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], + &in[10].val[0], &in[10].val[1], &in[11].val[0], + &in[11].val[1]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s1[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s1[23]); + s4[24] = highbd_idct_add_dual(s1[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s1[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s2[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s2[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s3[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s3[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + s8[0] = highbd_idct_add_dual(s7[0], s6[31]); + s8[1] = highbd_idct_add_dual(s7[1], s6[30]); + s8[2] = highbd_idct_add_dual(s7[2], s6[29]); + s8[3] = highbd_idct_add_dual(s7[3], s6[28]); + s8[4] = highbd_idct_add_dual(s7[4], s7[27]); + s8[5] = highbd_idct_add_dual(s7[5], s7[26]); + s8[6] = highbd_idct_add_dual(s7[6], s7[25]); + s8[7] = highbd_idct_add_dual(s7[7], s7[24]); + s8[8] = highbd_idct_add_dual(s7[8], s7[23]); + s8[9] = highbd_idct_add_dual(s7[9], s7[22]); + s8[10] = highbd_idct_add_dual(s7[10], s7[21]); + s8[11] = highbd_idct_add_dual(s7[11], s7[20]); + s8[12] = highbd_idct_add_dual(s7[12], s6[19]); + s8[13] = highbd_idct_add_dual(s7[13], s6[18]); + s8[14] = highbd_idct_add_dual(s7[14], s6[17]); + s8[15] = highbd_idct_add_dual(s7[15], s6[16]); + s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); + s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); + s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); + s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); + s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); + s8[21] = highbd_idct_sub_dual(s7[10], s7[21]); + s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); + s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); + s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); + s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); + s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); + s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); + s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); + s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); + s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); + s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + vst1q_s32(output + 0, s8[0].val[0]); + vst1q_s32(output + 4, s8[0].val[1]); + output += 16; + vst1q_s32(output + 0, s8[1].val[0]); + vst1q_s32(output + 4, s8[1].val[1]); + output += 16; + vst1q_s32(output + 0, s8[2].val[0]); + vst1q_s32(output + 4, s8[2].val[1]); + output += 16; + vst1q_s32(output + 0, s8[3].val[0]); + vst1q_s32(output + 4, s8[3].val[1]); + output += 16; + vst1q_s32(output + 0, s8[4].val[0]); + vst1q_s32(output + 4, s8[4].val[1]); + output += 16; + vst1q_s32(output + 0, s8[5].val[0]); + vst1q_s32(output + 4, s8[5].val[1]); + output += 16; + vst1q_s32(output + 0, s8[6].val[0]); + vst1q_s32(output + 4, s8[6].val[1]); + output += 16; + vst1q_s32(output + 0, s8[7].val[0]); + vst1q_s32(output + 4, s8[7].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[8].val[0]); + vst1q_s32(output + 4, s8[8].val[1]); + output += 16; + vst1q_s32(output + 0, s8[9].val[0]); + vst1q_s32(output + 4, s8[9].val[1]); + output += 16; + vst1q_s32(output + 0, s8[10].val[0]); + vst1q_s32(output + 4, s8[10].val[1]); + output += 16; + vst1q_s32(output + 0, s8[11].val[0]); + vst1q_s32(output + 4, s8[11].val[1]); + output += 16; + vst1q_s32(output + 0, s8[12].val[0]); + vst1q_s32(output + 4, s8[12].val[1]); + output += 16; + vst1q_s32(output + 0, s8[13].val[0]); + vst1q_s32(output + 4, s8[13].val[1]); + output += 16; + vst1q_s32(output + 0, s8[14].val[0]); + vst1q_s32(output + 4, s8[14].val[1]); + output += 16; + vst1q_s32(output + 0, s8[15].val[0]); + vst1q_s32(output + 4, s8[15].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[16].val[0]); + vst1q_s32(output + 4, s8[16].val[1]); + output += 16; + vst1q_s32(output + 0, s8[17].val[0]); + vst1q_s32(output + 4, s8[17].val[1]); + output += 16; + vst1q_s32(output + 0, s8[18].val[0]); + vst1q_s32(output + 4, s8[18].val[1]); + output += 16; + vst1q_s32(output + 0, s8[19].val[0]); + vst1q_s32(output + 4, s8[19].val[1]); + output += 16; + vst1q_s32(output + 0, s8[20].val[0]); + vst1q_s32(output + 4, s8[20].val[1]); + output += 16; + vst1q_s32(output + 0, s8[21].val[0]); + vst1q_s32(output + 4, s8[21].val[1]); + output += 16; + vst1q_s32(output + 0, s8[22].val[0]); + vst1q_s32(output + 4, s8[22].val[1]); + output += 16; + vst1q_s32(output + 0, s8[23].val[0]); + vst1q_s32(output + 4, s8[23].val[1]); + output += 16; + + vst1q_s32(output + 0, s8[24].val[0]); + vst1q_s32(output + 4, s8[24].val[1]); + output += 16; + vst1q_s32(output + 0, s8[25].val[0]); + vst1q_s32(output + 4, s8[25].val[1]); + output += 16; + vst1q_s32(output + 0, s8[26].val[0]); + vst1q_s32(output + 4, s8[26].val[1]); + output += 16; + vst1q_s32(output + 0, s8[27].val[0]); + vst1q_s32(output + 4, s8[27].val[1]); + output += 16; + vst1q_s32(output + 0, s8[28].val[0]); + vst1q_s32(output + 4, s8[28].val[1]); + output += 16; + vst1q_s32(output + 0, s8[29].val[0]); + vst1q_s32(output + 4, s8[29].val[1]); + output += 16; + vst1q_s32(output + 0, s8[30].val[0]); + vst1q_s32(output + 4, s8[30].val[1]); + output += 16; + vst1q_s32(output + 0, s8[31].val[0]); + vst1q_s32(output + 4, s8[31].val[1]); +} + +static void vpx_highbd_idct32_16_neon(const int32_t *const input, + uint16_t *const output, const int stride, + const int bd) { + int32x4x2_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; + + load_and_transpose_s32_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + load_and_transpose_s32_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[17] = multiply_shift_and_narrow_s32_dual(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s32_dual(in[15], cospi_15_64); + + s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); + + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); + + s1[22] = multiply_shift_and_narrow_s32_dual(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s32_dual(in[13], cospi_13_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[9] = multiply_shift_and_narrow_s32_dual(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s32_dual(in[14], cospi_14_64); + + s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[17]); + s2[17] = highbd_idct_sub_dual(s1[16], s1[17]); + s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); + s2[19] = highbd_idct_add_dual(s1[18], s1[19]); + s2[20] = highbd_idct_add_dual(s1[20], s1[21]); + s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[23], s1[22]); + s2[23] = highbd_idct_add_dual(s1[22], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[25]); + s2[25] = highbd_idct_sub_dual(s1[24], s1[25]); + s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); + s2[27] = highbd_idct_add_dual(s1[26], s1[27]); + s2[28] = highbd_idct_add_dual(s1[28], s1[29]); + s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); + s2[30] = highbd_idct_sub_dual(s1[31], s1[30]); + s2[31] = highbd_idct_add_dual(s1[30], s1[31]); + + // stage 3 + s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s3[5] = multiply_shift_and_narrow_s32_dual(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s32_dual(in[12], cospi_12_64); + + s3[8] = highbd_idct_add_dual(s2[8], s2[9]); + s3[9] = highbd_idct_sub_dual(s2[8], s2[9]); + s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); + s3[11] = highbd_idct_add_dual(s2[10], s2[11]); + s3[12] = highbd_idct_add_dual(s2[12], s2[13]); + s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); + s3[14] = highbd_idct_sub_dual(s2[15], s2[14]); + s3[15] = highbd_idct_add_dual(s2[14], s2[15]); + + s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], -cospi_4_64, + s2[30], cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s2[17], cospi_28_64, + s2[30], cospi_4_64); + + s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, + s2[29], cospi_28_64); + + s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, + s2[26], cospi_20_64); + + s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s2[22], -cospi_20_64, + s2[25], cospi_12_64); + + // stage 4 + s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); + + s4[4] = highbd_idct_add_dual(s3[4], s3[5]); + s4[5] = highbd_idct_sub_dual(s3[4], s3[5]); + s4[6] = highbd_idct_sub_dual(s3[7], s3[6]); + s4[7] = highbd_idct_add_dual(s3[6], s3[7]); + + s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], -cospi_8_64, + s3[14], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s3[9], cospi_24_64, + s3[14], cospi_8_64); + + s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, + s3[13], cospi_24_64); + + s4[16] = highbd_idct_add_dual(s2[16], s2[19]); + s4[17] = highbd_idct_add_dual(s3[17], s3[18]); + s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); + s4[19] = highbd_idct_sub_dual(s2[16], s2[19]); + s4[20] = highbd_idct_sub_dual(s2[23], s2[20]); + s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); + s4[22] = highbd_idct_add_dual(s3[21], s3[22]); + s4[23] = highbd_idct_add_dual(s2[20], s2[23]); + s4[24] = highbd_idct_add_dual(s2[24], s2[27]); + s4[25] = highbd_idct_add_dual(s3[25], s3[26]); + s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); + s4[27] = highbd_idct_sub_dual(s2[24], s2[27]); + s4[28] = highbd_idct_sub_dual(s2[31], s2[28]); + s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); + s4[30] = highbd_idct_add_dual(s3[29], s3[30]); + s4[31] = highbd_idct_add_dual(s2[28], s2[31]); + + // stage 5 + s5[0] = highbd_idct_add_dual(s4[0], s4[3]); + s5[1] = highbd_idct_add_dual(s4[0], s4[2]); + s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); + s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); + + s5[5] = sub_multiply_shift_and_narrow_s32_dual(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s32_dual(s4[5], s4[6], cospi_16_64); + + s5[8] = highbd_idct_add_dual(s3[8], s3[11]); + s5[9] = highbd_idct_add_dual(s4[9], s4[10]); + s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); + s5[11] = highbd_idct_sub_dual(s3[8], s3[11]); + s5[12] = highbd_idct_sub_dual(s3[15], s3[12]); + s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); + s5[14] = highbd_idct_add_dual(s4[13], s4[14]); + s5[15] = highbd_idct_add_dual(s3[15], s3[12]); + + s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, + s4[29], cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, + s4[29], cospi_8_64); + + s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, + s4[28], cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, + s4[28], cospi_8_64); + + s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, + s4[27], cospi_24_64); + + s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, + s4[26], cospi_24_64); + + // stage 6 + s6[0] = highbd_idct_add_dual(s5[0], s4[7]); + s6[1] = highbd_idct_add_dual(s5[1], s5[6]); + s6[2] = highbd_idct_add_dual(s5[2], s5[5]); + s6[3] = highbd_idct_add_dual(s5[3], s4[4]); + s6[4] = highbd_idct_sub_dual(s5[3], s4[4]); + s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); + s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); + s6[7] = highbd_idct_sub_dual(s5[0], s4[7]); + + s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); + + s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); + + s6[16] = highbd_idct_add_dual(s4[16], s4[23]); + s6[17] = highbd_idct_add_dual(s4[17], s4[22]); + s6[18] = highbd_idct_add_dual(s5[18], s5[21]); + s6[19] = highbd_idct_add_dual(s5[19], s5[20]); + s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); + s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); + s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); + s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); + s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); + s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); + s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); + s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); + s6[28] = highbd_idct_add_dual(s5[27], s5[28]); + s6[29] = highbd_idct_add_dual(s5[26], s5[29]); + s6[30] = highbd_idct_add_dual(s4[25], s4[30]); + s6[31] = highbd_idct_add_dual(s4[24], s4[31]); + + // stage 7 + s7[0] = highbd_idct_add_dual(s6[0], s5[15]); + s7[1] = highbd_idct_add_dual(s6[1], s5[14]); + s7[2] = highbd_idct_add_dual(s6[2], s6[13]); + s7[3] = highbd_idct_add_dual(s6[3], s6[12]); + s7[4] = highbd_idct_add_dual(s6[4], s6[11]); + s7[5] = highbd_idct_add_dual(s6[5], s6[10]); + s7[6] = highbd_idct_add_dual(s6[6], s5[9]); + s7[7] = highbd_idct_add_dual(s6[7], s5[8]); + s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); + s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); + s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); + s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); + s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); + s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); + s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); + s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); + + s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); + + s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); + + s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); + + s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s7[0], s6[31]); + out[1] = highbd_idct_add_dual(s7[1], s6[30]); + out[2] = highbd_idct_add_dual(s7[2], s6[29]); + out[3] = highbd_idct_add_dual(s7[3], s6[28]); + out[4] = highbd_idct_add_dual(s7[4], s7[27]); + out[5] = highbd_idct_add_dual(s7[5], s7[26]); + out[6] = highbd_idct_add_dual(s7[6], s7[25]); + out[7] = highbd_idct_add_dual(s7[7], s7[24]); + out[8] = highbd_idct_add_dual(s7[8], s7[23]); + out[9] = highbd_idct_add_dual(s7[9], s7[22]); + out[10] = highbd_idct_add_dual(s7[10], s7[21]); + out[11] = highbd_idct_add_dual(s7[11], s7[20]); + out[12] = highbd_idct_add_dual(s7[12], s6[19]); + out[13] = highbd_idct_add_dual(s7[13], s6[18]); + out[14] = highbd_idct_add_dual(s7[14], s6[17]); + out[15] = highbd_idct_add_dual(s7[15], s6[16]); + out[16] = highbd_idct_sub_dual(s7[15], s6[16]); + out[17] = highbd_idct_sub_dual(s7[14], s6[17]); + out[18] = highbd_idct_sub_dual(s7[13], s6[18]); + out[19] = highbd_idct_sub_dual(s7[12], s6[19]); + out[20] = highbd_idct_sub_dual(s7[11], s7[20]); + out[21] = highbd_idct_sub_dual(s7[10], s7[21]); + out[22] = highbd_idct_sub_dual(s7[9], s7[22]); + out[23] = highbd_idct_sub_dual(s7[8], s7[23]); + out[24] = highbd_idct_sub_dual(s7[7], s7[24]); + out[25] = highbd_idct_sub_dual(s7[6], s7[25]); + out[26] = highbd_idct_sub_dual(s7[5], s7[26]); + out[27] = highbd_idct_sub_dual(s7[4], s7[27]); + out[28] = highbd_idct_sub_dual(s7[3], s6[28]); + out[29] = highbd_idct_sub_dual(s7[2], s6[29]); + out[30] = highbd_idct_sub_dual(s7[1], s6[30]); + out[31] = highbd_idct_sub_dual(s7[0], s6[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 16]; + int16_t *t = temp; + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_16_neon(t, dest, stride, 1); + t += (16 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 16]; + int32_t *t = temp; + vpx_highbd_idct32_12_neon(input, temp); + vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_16_neon(t, dest, stride, bd); + t += (16 * 8); + dest += 8; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c new file mode 100644 index 0000000000..f05932cec3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[67] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. + +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void vpx_highbd_idct32_6_neon(const tran_low_t *input, int32_t *output) { + int32x4x2_t in[8], s1[32], s2[32], s3[32]; + + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 32; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 32; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 32; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 32; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 32; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 32; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 32; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], -cospi_8_64, + s1[30], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[17], cospi_24_64, + s1[30], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_8_64, + s1[31], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_24_64, + s1[31], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s2[9], s2[14], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s2[8], s2[15], cospi_16_64); + + s2[16] = highbd_idct_add_dual(s1[16], s2[23]); + s2[17] = highbd_idct_add_dual(s1[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s2[22] = highbd_idct_sub_dual(s1[17], s2[22]); + s2[23] = highbd_idct_sub_dual(s1[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s1[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s1[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s1[30]); + s2[31] = highbd_idct_add_dual(s2[24], s1[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s2[15]); + s1[1] = highbd_idct_add_dual(s2[1], s2[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s2[9]); + s1[7] = highbd_idct_add_dual(s2[7], s2[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s2[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s2[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s2[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s2[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s1[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s2[22], s3[25], cospi_16_64); + + s1[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s2[23], s3[24], cospi_16_64); + + // final stage + s3[0] = highbd_idct_add_dual(s1[0], s2[31]); + s3[1] = highbd_idct_add_dual(s1[1], s2[30]); + s3[2] = highbd_idct_add_dual(s1[2], s2[29]); + s3[3] = highbd_idct_add_dual(s1[3], s2[28]); + s3[4] = highbd_idct_add_dual(s1[4], s1[27]); + s3[5] = highbd_idct_add_dual(s1[5], s1[26]); + s3[6] = highbd_idct_add_dual(s1[6], s1[25]); + s3[7] = highbd_idct_add_dual(s1[7], s1[24]); + s3[8] = highbd_idct_add_dual(s1[8], s1[23]); + s3[9] = highbd_idct_add_dual(s1[9], s1[22]); + s3[10] = highbd_idct_add_dual(s1[10], s1[21]); + s3[11] = highbd_idct_add_dual(s1[11], s1[20]); + s3[12] = highbd_idct_add_dual(s1[12], s2[19]); + s3[13] = highbd_idct_add_dual(s1[13], s2[18]); + s3[14] = highbd_idct_add_dual(s1[14], s2[17]); + s3[15] = highbd_idct_add_dual(s1[15], s2[16]); + s3[16] = highbd_idct_sub_dual(s1[15], s2[16]); + s3[17] = highbd_idct_sub_dual(s1[14], s2[17]); + s3[18] = highbd_idct_sub_dual(s1[13], s2[18]); + s3[19] = highbd_idct_sub_dual(s1[12], s2[19]); + s3[20] = highbd_idct_sub_dual(s1[11], s1[20]); + s3[21] = highbd_idct_sub_dual(s1[10], s1[21]); + s3[22] = highbd_idct_sub_dual(s1[9], s1[22]); + s3[23] = highbd_idct_sub_dual(s1[8], s1[23]); + s3[24] = highbd_idct_sub_dual(s1[7], s1[24]); + s3[25] = highbd_idct_sub_dual(s1[6], s1[25]); + s3[26] = highbd_idct_sub_dual(s1[5], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[4], s1[27]); + s3[28] = highbd_idct_sub_dual(s1[3], s2[28]); + s3[29] = highbd_idct_sub_dual(s1[2], s2[29]); + s3[30] = highbd_idct_sub_dual(s1[1], s2[30]); + s3[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + vst1q_s32(output, s3[0].val[0]); + output += 4; + vst1q_s32(output, s3[0].val[1]); + output += 4; + vst1q_s32(output, s3[1].val[0]); + output += 4; + vst1q_s32(output, s3[1].val[1]); + output += 4; + vst1q_s32(output, s3[2].val[0]); + output += 4; + vst1q_s32(output, s3[2].val[1]); + output += 4; + vst1q_s32(output, s3[3].val[0]); + output += 4; + vst1q_s32(output, s3[3].val[1]); + output += 4; + vst1q_s32(output, s3[4].val[0]); + output += 4; + vst1q_s32(output, s3[4].val[1]); + output += 4; + vst1q_s32(output, s3[5].val[0]); + output += 4; + vst1q_s32(output, s3[5].val[1]); + output += 4; + vst1q_s32(output, s3[6].val[0]); + output += 4; + vst1q_s32(output, s3[6].val[1]); + output += 4; + vst1q_s32(output, s3[7].val[0]); + output += 4; + vst1q_s32(output, s3[7].val[1]); + output += 4; + + vst1q_s32(output, s3[8].val[0]); + output += 4; + vst1q_s32(output, s3[8].val[1]); + output += 4; + vst1q_s32(output, s3[9].val[0]); + output += 4; + vst1q_s32(output, s3[9].val[1]); + output += 4; + vst1q_s32(output, s3[10].val[0]); + output += 4; + vst1q_s32(output, s3[10].val[1]); + output += 4; + vst1q_s32(output, s3[11].val[0]); + output += 4; + vst1q_s32(output, s3[11].val[1]); + output += 4; + vst1q_s32(output, s3[12].val[0]); + output += 4; + vst1q_s32(output, s3[12].val[1]); + output += 4; + vst1q_s32(output, s3[13].val[0]); + output += 4; + vst1q_s32(output, s3[13].val[1]); + output += 4; + vst1q_s32(output, s3[14].val[0]); + output += 4; + vst1q_s32(output, s3[14].val[1]); + output += 4; + vst1q_s32(output, s3[15].val[0]); + output += 4; + vst1q_s32(output, s3[15].val[1]); + output += 4; + + vst1q_s32(output, s3[16].val[0]); + output += 4; + vst1q_s32(output, s3[16].val[1]); + output += 4; + vst1q_s32(output, s3[17].val[0]); + output += 4; + vst1q_s32(output, s3[17].val[1]); + output += 4; + vst1q_s32(output, s3[18].val[0]); + output += 4; + vst1q_s32(output, s3[18].val[1]); + output += 4; + vst1q_s32(output, s3[19].val[0]); + output += 4; + vst1q_s32(output, s3[19].val[1]); + output += 4; + vst1q_s32(output, s3[20].val[0]); + output += 4; + vst1q_s32(output, s3[20].val[1]); + output += 4; + vst1q_s32(output, s3[21].val[0]); + output += 4; + vst1q_s32(output, s3[21].val[1]); + output += 4; + vst1q_s32(output, s3[22].val[0]); + output += 4; + vst1q_s32(output, s3[22].val[1]); + output += 4; + vst1q_s32(output, s3[23].val[0]); + output += 4; + vst1q_s32(output, s3[23].val[1]); + output += 4; + + vst1q_s32(output, s3[24].val[0]); + output += 4; + vst1q_s32(output, s3[24].val[1]); + output += 4; + vst1q_s32(output, s3[25].val[0]); + output += 4; + vst1q_s32(output, s3[25].val[1]); + output += 4; + vst1q_s32(output, s3[26].val[0]); + output += 4; + vst1q_s32(output, s3[26].val[1]); + output += 4; + vst1q_s32(output, s3[27].val[0]); + output += 4; + vst1q_s32(output, s3[27].val[1]); + output += 4; + vst1q_s32(output, s3[28].val[0]); + output += 4; + vst1q_s32(output, s3[28].val[1]); + output += 4; + vst1q_s32(output, s3[29].val[0]); + output += 4; + vst1q_s32(output, s3[29].val[1]); + output += 4; + vst1q_s32(output, s3[30].val[0]); + output += 4; + vst1q_s32(output, s3[30].val[1]); + output += 4; + vst1q_s32(output, s3[31].val[0]); + output += 4; + vst1q_s32(output, s3[31].val[1]); +} + +static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, + int stride, const int bd) { + int32x4x2_t in[8], s1[32], s2[32], s3[32], out[32]; + + load_and_transpose_s32_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); + + // stage 1 + s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); + + // Different for _8_ + s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); + + s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); + + s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); + + // stage 2 + s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); + + s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); + + // stage 3 + s1[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); + + s1[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, + s1[31], cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, + s1[31], cospi_4_64); + + // Different for _8_ + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s1[19], -cospi_4_64, + s1[28], cospi_28_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s1[20], cospi_12_64, + s1[27], cospi_20_64); + + s1[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, + s1[24], cospi_12_64); + + // stage 4 + s1[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); + + s2[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, + s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, + s2[15], cospi_8_64); + + s2[10] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s32_dual(s2[11], -cospi_8_64, + s2[12], cospi_24_64); + + s2[16] = highbd_idct_add_dual(s1[16], s1[19]); + + s2[17] = highbd_idct_add_dual(s1[17], s1[18]); + s2[18] = highbd_idct_sub_dual(s1[17], s1[18]); + + s2[19] = highbd_idct_sub_dual(s1[16], s1[19]); + + s2[20] = highbd_idct_sub_dual(s1[23], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[22], s1[21]); + + s2[22] = highbd_idct_add_dual(s1[21], s1[22]); + s2[23] = highbd_idct_add_dual(s1[20], s1[23]); + + s2[24] = highbd_idct_add_dual(s1[24], s1[27]); + s2[25] = highbd_idct_add_dual(s1[25], s1[26]); + s2[26] = highbd_idct_sub_dual(s1[25], s1[26]); + s2[27] = highbd_idct_sub_dual(s1[24], s1[27]); + + s2[28] = highbd_idct_sub_dual(s1[31], s1[28]); + s2[29] = highbd_idct_sub_dual(s1[30], s1[29]); + s2[30] = highbd_idct_add_dual(s1[29], s1[30]); + s2[31] = highbd_idct_add_dual(s1[28], s1[31]); + + // stage 5 + s1[5] = sub_multiply_shift_and_narrow_s32_dual(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s32_dual(s1[4], s1[7], cospi_16_64); + + s1[8] = highbd_idct_add_dual(s2[8], s2[11]); + s1[9] = highbd_idct_add_dual(s2[9], s2[10]); + s1[10] = highbd_idct_sub_dual(s2[9], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[8], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[15], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[14], s2[13]); + s1[14] = highbd_idct_add_dual(s2[13], s2[14]); + s1[15] = highbd_idct_add_dual(s2[12], s2[15]); + + s1[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_8_64, + s2[29], cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], cospi_24_64, + s2[29], cospi_8_64); + + s1[19] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], -cospi_8_64, + s2[28], cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s32_dual(s2[19], cospi_24_64, + s2[28], cospi_8_64); + + s1[20] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s32_dual(s2[20], -cospi_8_64, + s2[27], cospi_24_64); + + s1[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_8_64, + s2[26], cospi_24_64); + + // stage 6 + s2[0] = highbd_idct_add_dual(s1[0], s1[7]); + s2[1] = highbd_idct_add_dual(s1[0], s1[6]); + s2[2] = highbd_idct_add_dual(s1[0], s1[5]); + s2[3] = highbd_idct_add_dual(s1[0], s1[4]); + s2[4] = highbd_idct_sub_dual(s1[0], s1[4]); + s2[5] = highbd_idct_sub_dual(s1[0], s1[5]); + s2[6] = highbd_idct_sub_dual(s1[0], s1[6]); + s2[7] = highbd_idct_sub_dual(s1[0], s1[7]); + + s2[10] = sub_multiply_shift_and_narrow_s32_dual(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s32_dual(s1[10], s1[13], cospi_16_64); + + s2[11] = sub_multiply_shift_and_narrow_s32_dual(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s32_dual(s1[11], s1[12], cospi_16_64); + + s1[16] = highbd_idct_add_dual(s2[16], s2[23]); + s1[17] = highbd_idct_add_dual(s2[17], s2[22]); + s2[18] = highbd_idct_add_dual(s1[18], s1[21]); + s2[19] = highbd_idct_add_dual(s1[19], s1[20]); + s2[20] = highbd_idct_sub_dual(s1[19], s1[20]); + s2[21] = highbd_idct_sub_dual(s1[18], s1[21]); + s1[22] = highbd_idct_sub_dual(s2[17], s2[22]); + s1[23] = highbd_idct_sub_dual(s2[16], s2[23]); + + s3[24] = highbd_idct_sub_dual(s2[31], s2[24]); + s3[25] = highbd_idct_sub_dual(s2[30], s2[25]); + s3[26] = highbd_idct_sub_dual(s1[29], s1[26]); + s3[27] = highbd_idct_sub_dual(s1[28], s1[27]); + s2[28] = highbd_idct_add_dual(s1[27], s1[28]); + s2[29] = highbd_idct_add_dual(s1[26], s1[29]); + s2[30] = highbd_idct_add_dual(s2[25], s2[30]); + s2[31] = highbd_idct_add_dual(s2[24], s2[31]); + + // stage 7 + s1[0] = highbd_idct_add_dual(s2[0], s1[15]); + s1[1] = highbd_idct_add_dual(s2[1], s1[14]); + s1[2] = highbd_idct_add_dual(s2[2], s2[13]); + s1[3] = highbd_idct_add_dual(s2[3], s2[12]); + s1[4] = highbd_idct_add_dual(s2[4], s2[11]); + s1[5] = highbd_idct_add_dual(s2[5], s2[10]); + s1[6] = highbd_idct_add_dual(s2[6], s1[9]); + s1[7] = highbd_idct_add_dual(s2[7], s1[8]); + s1[8] = highbd_idct_sub_dual(s2[7], s1[8]); + s1[9] = highbd_idct_sub_dual(s2[6], s1[9]); + s1[10] = highbd_idct_sub_dual(s2[5], s2[10]); + s1[11] = highbd_idct_sub_dual(s2[4], s2[11]); + s1[12] = highbd_idct_sub_dual(s2[3], s2[12]); + s1[13] = highbd_idct_sub_dual(s2[2], s2[13]); + s1[14] = highbd_idct_sub_dual(s2[1], s1[14]); + s1[15] = highbd_idct_sub_dual(s2[0], s1[15]); + + s1[20] = sub_multiply_shift_and_narrow_s32_dual(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s32_dual(s2[20], s3[27], cospi_16_64); + + s1[21] = sub_multiply_shift_and_narrow_s32_dual(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s32_dual(s2[21], s3[26], cospi_16_64); + + s2[22] = sub_multiply_shift_and_narrow_s32_dual(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s32_dual(s1[22], s3[25], cospi_16_64); + + s2[23] = sub_multiply_shift_and_narrow_s32_dual(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s32_dual(s1[23], s3[24], cospi_16_64); + + // final stage + out[0] = highbd_idct_add_dual(s1[0], s2[31]); + out[1] = highbd_idct_add_dual(s1[1], s2[30]); + out[2] = highbd_idct_add_dual(s1[2], s2[29]); + out[3] = highbd_idct_add_dual(s1[3], s2[28]); + out[4] = highbd_idct_add_dual(s1[4], s1[27]); + out[5] = highbd_idct_add_dual(s1[5], s1[26]); + out[6] = highbd_idct_add_dual(s1[6], s1[25]); + out[7] = highbd_idct_add_dual(s1[7], s1[24]); + out[8] = highbd_idct_add_dual(s1[8], s2[23]); + out[9] = highbd_idct_add_dual(s1[9], s2[22]); + out[10] = highbd_idct_add_dual(s1[10], s1[21]); + out[11] = highbd_idct_add_dual(s1[11], s1[20]); + out[12] = highbd_idct_add_dual(s1[12], s2[19]); + out[13] = highbd_idct_add_dual(s1[13], s2[18]); + out[14] = highbd_idct_add_dual(s1[14], s1[17]); + out[15] = highbd_idct_add_dual(s1[15], s1[16]); + out[16] = highbd_idct_sub_dual(s1[15], s1[16]); + out[17] = highbd_idct_sub_dual(s1[14], s1[17]); + out[18] = highbd_idct_sub_dual(s1[13], s2[18]); + out[19] = highbd_idct_sub_dual(s1[12], s2[19]); + out[20] = highbd_idct_sub_dual(s1[11], s1[20]); + out[21] = highbd_idct_sub_dual(s1[10], s1[21]); + out[22] = highbd_idct_sub_dual(s1[9], s2[22]); + out[23] = highbd_idct_sub_dual(s1[8], s2[23]); + out[24] = highbd_idct_sub_dual(s1[7], s1[24]); + out[25] = highbd_idct_sub_dual(s1[6], s1[25]); + out[26] = highbd_idct_sub_dual(s1[5], s1[26]); + out[27] = highbd_idct_sub_dual(s1[4], s1[27]); + out[28] = highbd_idct_sub_dual(s1[3], s2[28]); + out[29] = highbd_idct_sub_dual(s1[2], s2[29]); + out[30] = highbd_idct_sub_dual(s1[1], s2[30]); + out[31] = highbd_idct_sub_dual(s1[0], s2[31]); + + highbd_idct16x16_add_store(out, output, stride, bd); + highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); +} + +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + + if (bd == 8) { + int16_t temp[32 * 8]; + int16_t *t = temp; + + vpx_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_idct32_8_neon(t, dest, stride, 1); + t += (8 * 8); + dest += 8; + } + } else { + int32_t temp[32 * 8]; + int32_t *t = temp; + + vpx_highbd_idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + vpx_highbd_idct32_8_neon(t, dest, stride, bd); + t += (8 * 8); + dest += 8; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c new file mode 100644 index 0000000000..c1354c0c1a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + const int16x8_t c2 = vminq_s16(b2, max); + const int16x8_t c3 = vminq_s16(b3, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); + vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); + *dest += stride; +} + +static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + const uint16x8_t c2 = vqshluq_n_s16(b2, 0); + const uint16x8_t c3 = vqshluq_n_s16(b3, 0); + vst1q_u16(*dest, c0); + vst1q_u16(*dest + 8, c1); + vst1q_u16(*dest + 16, c2); + vst1q_u16(*dest + 24, c3); + *dest += stride; +} + +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index b9e226a684..7be1dad1d3 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -11,27 +11,10 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { - const uint16x4_t a0 = vld1_u16(*dest); - const uint16x4_t a1 = vld1_u16(*dest + stride); - const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); - // Note: In some profile tests, res is quite close to +/-32767. - // We use saturating addition. - const int16x8_t b = vqaddq_s16(res, a); - const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1_u16(*dest, vget_low_u16(d)); - *dest += stride; - vst1_u16(*dest, vget_high_u16(d)); - *dest += stride; -} - // res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, @@ -51,123 +34,56 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, *dest += stride; } -void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); } -static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); - b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); - b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); - b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); - b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); - b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, 14); - b1 = vrshrq_n_s32(b1, 14); - b2 = vrshrq_n_s32(b2, 14); - b3 = vrshrq_n_s32(b3, 14); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); - c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); - c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); - c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); - c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); - c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); - c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); - c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); - c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); - c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); - c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); - c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); - c4 = vsubq_s64(c4, c8); - c5 = vsubq_s64(c5, c9); - c6 = vaddq_s64(c6, c10); - c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14)); - b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14)); - b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14)); - b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14)); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585, - 6270 }; const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int32x4_t c0 = vld1q_s32(input); - int32x4_t c1 = vld1q_s32(input + 4); - int32x4_t c2 = vld1q_s32(input + 8); - int32x4_t c3 = vld1q_s32(input + 12); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int16x8_t a0, a1; + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); if (bd == 8) { - const int16x4_t cospis = vld1_s16(kCospi); - // Rows - a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); - a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); } else { - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); } - a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); - a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); } - highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); - highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index c1c0f645d1..bed3227ca7 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -11,41 +11,61 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { +static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { const uint16x8_t a = vld1q_u16(*dest); const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1q_u16(*dest, d); + vst1q_u16(*dest, vreinterpretq_u16_s16(c)); *dest += stride; } -void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, +static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const uint16x8_t c = vqshluq_n_s16(b, 0); + vst1q_u16(*dest, c); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + } else { + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + } } static INLINE void idct8x8_12_half1d_bd10( @@ -62,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10( step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); - step1[4] = vrshrq_n_s32(step1[4], 14); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); - step1[7] = vrshrq_n_s32(step1[7], 14); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); // stage 2 step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); - step2[1] = vrshrq_n_s32(step2[1], 14); - step2[2] = vrshrq_n_s32(step2[2], 14); - step2[3] = vrshrq_n_s32(step2[3], 14); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); step2[4] = vaddq_s32(step1[4], step1[5]); step2[5] = vsubq_s32(step1[4], step1[5]); @@ -89,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10( step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); // stage 4 *io0 = vaddq_s32(step1[0], step2[7]); @@ -108,7 +128,7 @@ static INLINE void idct8x8_12_half1d_bd12( int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t input1l, input1h, input3l, input3h; int32x2_t step1l[2], step1h[2]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -117,31 +137,31 @@ static INLINE void idct8x8_12_half1d_bd12( transpose_s32_4x4(io0, io1, io2, io3); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); step1h[1] = vget_high_s32(*io2); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step1[4] = vcombine_s32(t32[0], t32[1]); step1[5] = vcombine_s32(t32[2], t32[3]); step1[6] = vcombine_s32(t32[4], t32[5]); @@ -154,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12( t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step2[1] = vcombine_s32(t32[2], t32[3]); step2[2] = vcombine_s32(t32[4], t32[5]); step2[3] = vcombine_s32(t32[6], t32[7]); @@ -185,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12( vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); step1[5] = vcombine_s32(t32[0], t32[1]); step1[6] = vcombine_s32(t32[2], t32[3]); @@ -203,83 +223,15 @@ static INLINE void idct8x8_12_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint16_t *dest, - const int stride, const int bd) { - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const uint16_t *dst = dest; - uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; - - d0 = vld1q_u16(dst); - dst += stride; - d1 = vld1q_u16(dst); - dst += stride; - d2 = vld1q_u16(dst); - dst += stride; - d3 = vld1q_u16(dst); - dst += stride; - d4 = vld1q_u16(dst); - dst += stride; - d5 = vld1q_u16(dst); - dst += stride; - d6 = vld1q_u16(dst); - dst += stride; - d7 = vld1q_u16(dst); - - d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); - d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); - d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); - d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); - d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); - d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); - d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); - d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); - - d0_s16 = vminq_s16(d0_s16, max); - d1_s16 = vminq_s16(d1_s16, max); - d2_s16 = vminq_s16(d2_s16, max); - d3_s16 = vminq_s16(d3_s16, max); - d4_s16 = vminq_s16(d4_s16, max); - d5_s16 = vminq_s16(d5_s16, max); - d6_s16 = vminq_s16(d6_s16, max); - d7_s16 = vminq_s16(d7_s16, max); - d0_u16 = vqshluq_n_s16(d0_s16, 0); - d1_u16 = vqshluq_n_s16(d1_s16, 0); - d2_u16 = vqshluq_n_s16(d2_s16, 0); - d3_u16 = vqshluq_n_s16(d3_s16, 0); - d4_u16 = vqshluq_n_s16(d4_s16, 0); - d5_u16 = vqshluq_n_s16(d5_s16, 0); - d6_u16 = vqshluq_n_s16(d6_s16, 0); - d7_u16 = vqshluq_n_s16(d7_s16, 0); - - vst1q_u16(dest, d0_u16); - dest += stride; - vst1q_u16(dest, d1_u16); - dest += stride; - vst1q_u16(dest, d2_u16); - dest += stride; - vst1q_u16(dest, d3_u16); - dest += stride; - vst1q_u16(dest, d4_u16); - dest += stride; - vst1q_u16(dest, d5_u16); - dest += stride; - vst1q_u16(dest, d6_u16); - dest += stride; - vst1q_u16(dest, d7_u16); -} - -void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 8); - int32x4_t a2 = vld1q_s32(input + 16); - int32x4_t a3 = vld1q_s32(input + 24); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -287,328 +239,133 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t b0 = vmovn_s32(a0); - int16x4_t b1 = vmovn_s32(a1); - int16x4_t b2 = vmovn_s32(a2); - int16x4_t b3 = vmovn_s32(a3); - int16x4_t b4, b5, b6, b7; + int16x4_t b[8]; - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, - &b5, &b6, &b7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, - b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); - c0 = vrshrq_n_s16(c0, 5); - c1 = vrshrq_n_s16(c1, 5); - c2 = vrshrq_n_s16(c2, 5); - c3 = vrshrq_n_s16(c3, 5); - c4 = vrshrq_n_s16(c4, 5); - c5 = vrshrq_n_s16(c5, 5); - c6 = vrshrq_n_s16(c6, 5); - c7 = vrshrq_n_s16(c7, 5); + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; if (bd == 10) { - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } else { - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } -static INLINE void idct8x8_64_half1d_bd10( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x4_t step1[8], step2[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); - step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); - step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); - step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); - - step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); - step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); - step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); - step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); - - step1[4] = vrshrq_n_s32(step1[4], 14); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); - step1[7] = vrshrq_n_s32(step1[7], 14); - - // stage 2 - step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); - step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); - step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); - - step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); - step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); - - step2[0] = vrshrq_n_s32(step2[0], 14); - step2[1] = vrshrq_n_s32(step2[1], 14); - step2[2] = vrshrq_n_s32(step2[2], 14); - step2[3] = vrshrq_n_s32(step2[3], 14); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); - step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_half1d_bd12( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; - int32x2_t step1l[4], step1h[4]; - int32x4_t step1[8], step2[8]; - int64x2_t t64[8]; - int32x2_t t32[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); - input_5l = vget_low_s32(*io5); - input_5h = vget_high_s32(*io5); - input_7l = vget_low_s32(*io7); - input_7h = vget_high_s32(*io7); - step1l[0] = vget_low_s32(*io0); - step1h[0] = vget_high_s32(*io0); - step1l[1] = vget_low_s32(*io2); - step1h[1] = vget_high_s32(*io2); - step1l[2] = vget_low_s32(*io4); - step1h[2] = vget_high_s32(*io4); - step1l[3] = vget_low_s32(*io6); - step1h[3] = vget_high_s32(*io6); - - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); - t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); - t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); - t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); - t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); - t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); - t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); - t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); - step1[4] = vcombine_s32(t32[0], t32[1]); - step1[5] = vcombine_s32(t32[2], t32[3]); - step1[6] = vcombine_s32(t32[4], t32[5]); - step1[7] = vcombine_s32(t32[6], t32[7]); - - // stage 2 - t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); - t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); - t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); - t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); - t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); - t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); - t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); - t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); - t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); - t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); - t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); - step2[0] = vcombine_s32(t32[0], t32[1]); - step2[1] = vcombine_s32(t32[2], t32[3]); - step2[2] = vcombine_s32(t32[4], t32[5]); - step2[3] = vcombine_s32(t32[6], t32[7]); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[0] = - vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t64[2] = - vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - step1[5] = vcombine_s32(t32[0], t32[1]); - step1[6] = vcombine_s32(t32[2], t32[3]); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); -} - -void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 4); - int32x4_t a2 = vld1q_s32(input + 8); - int32x4_t a3 = vld1q_s32(input + 12); - int32x4_t a4 = vld1q_s32(input + 16); - int32x4_t a5 = vld1q_s32(input + 20); - int32x4_t a6 = vld1q_s32(input + 24); - int32x4_t a7 = vld1q_s32(input + 28); - int32x4_t a8 = vld1q_s32(input + 32); - int32x4_t a9 = vld1q_s32(input + 36); - int32x4_t a10 = vld1q_s32(input + 40); - int32x4_t a11 = vld1q_s32(input + 44); - int32x4_t a12 = vld1q_s32(input + 48); - int32x4_t a13 = vld1q_s32(input + 52); - int32x4_t a14 = vld1q_s32(input + 56); - int32x4_t a15 = vld1q_s32(input + 60); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); - int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); - int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); - int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); - int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); - int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); - int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); - int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + int16x8_t b[8]; - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); - c0 = vrshrq_n_s16(b0, 5); - c1 = vrshrq_n_s16(b1, 5); - c2 = vrshrq_n_s16(b2, 5); - c3 = vrshrq_n_s16(b3, 5); - c4 = vrshrq_n_s16(b4, 5); - c5 = vrshrq_n_s16(b5, 5); - c6 = vrshrq_n_s16(b6, 5); - c7 = vrshrq_n_s16(b7, 5); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 0000000000..518ef4336e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd); + +#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c index 6f7e5da762..235cb5b996 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -12,23 +12,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { const uint16x4_t ref_u16 = vld1_u16(ref); - const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); - return vpadd_u16(p0, p0); + return horizontal_add_uint16x4(ref_u16); } static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_u16(dst, dc_dup); + vst1_u16(dst, dc); } } @@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16x4_t a = vld1_u16(above); const uint16x4_t l = vld1_u16(left); - uint16x4_t sum; - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); (void)bd; - sum = vadd_u16(a, l); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 3); dc_store_4x4(dst, stride, dc); } void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(left); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)above; (void)bd; dc_store_4x4(dst, stride, dc); @@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(above); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, dc); @@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { const uint16x8_t ref_u16 = vld1q_u16(ref); - uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(ref_u16); } static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1q_u16(dst, dc_dup); + vst1q_u16(dst, dc); } } @@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t above_u16 = vld1q_u16(above); const uint16x8_t left_u16 = vld1q_u16(left); const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)bd; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 4); dc_store_8x8(dst, stride, dc); } void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(left); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)above; (void)bd; dc_store_8x8(dst, stride, dc); @@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(above); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)left; (void)bd; dc_store_8x8(dst, stride, dc); @@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_8x8(dst, stride, dc); @@ -142,47 +131,43 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { - const uint16x8x2_t ref_u16 = vld2q_u16(ref); - const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -289,166 +276,1304 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][1] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15]. + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; } +} - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); +// ----------------------------------------------------------------------------- + +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, d2); + vst1_u16(dst + 3 * stride, d3); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want to store: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ] + // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ] + // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ] + // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ] + // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ] + // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ] + // Note in particular that d0[7] and d1[7] are only ever referenced in the + // stride=0 and stride=1 cases respectively, and in later strides are + // replaced by a copy of above[7]. These are equivalent if for i>7, + // above[i]==above[7], however that is not always the case. + + // Strip out d0[7] and d1[7] so that we can replace it with an additional + // copy of above[7], the first vector here doesn't matter so just reuse + // d0/d1. + d0_ext = vextq_u16(d0, d0, 7); + d1_ext = vextq_u16(d1, d1, 7); + + // Shuffle in duplicates of above[7] and store. + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[1], d0[1], 7); + d1_ext = vextq_u16(d1[1], d1[1], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, a15); +} + +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[7], the first vector here doesn't matter so just + // reuse the same vector. + d0_ext = vextq_u16(d0[3], d0[3], 7); + d1_ext = vextq_u16(d1[3], d1[3], 7); + + // Shuffle in duplicates of above[7] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, a31); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, a31); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements. + col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + l25 = vld1q_u16(left + 25); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32-elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +} + +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); } // ----------------------------------------------------------------------------- @@ -696,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { @@ -725,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c index 5530c6425b..8d6e8acc4c 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -661,6 +661,17 @@ void vpx_highbd_lpf_vertical_8_dual_neon( vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); } +// Quiet warnings of the form: 'vpx_dsp/arm/highbd_loopfilter_neon.c|675 col 67| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + static void lpf_horizontal_16_kernel(uint16_t *s, int p, const uint16x8_t blimit_vec, const uint16x8_t limit_vec, @@ -723,6 +734,10 @@ static void lpf_vertical_16_kernel(uint16_t *s, int p, } } +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c new file mode 100644 index 0000000000..c2ad34a695 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_quantize_neon.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( + const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, + tran_low_t *dqcoeff_ptr) { + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE void highbd_quantize_8_neon( + const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift, + int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) { + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31); + const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31); + const int32x4_t coeff_0_abs = vabsq_s32(coeff_0); + const int32x4_t coeff_1_abs = vabsq_s32(coeff_1); + + // Calculate 2 masks of elements outside the bin + const int32x4_t zbin_mask_0 = + vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin)); + const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32( + vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1))); + + // Get the rounded values + const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round); + const int32x4_t rounded_1 = + vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1)); + + // (round * (quant << 15) * 2) >> 16 == (round * quant) + int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant); + int32x4_t qcoeff_tmp_1 = + vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1)); + + // Add rounded values + qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0); + qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1); + + // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift) + qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift); + qcoeff_tmp_1 = + vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1)); + + // Restore the sign bit. + qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign); + qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign); + qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign); + + // Only keep the relevant coeffs + *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0); + *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1); +} + +static VPX_FORCE_INLINE int16x8_t +highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, + const int32x4_t round, const int32x4_t quant, + const int32x4_t quant_shift, const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + do { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} + +static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32( + int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) { + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +} + +static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round, + const int32x4_t quant, const int32x4_t quant_shift, + const int32x4_t dequant) { + int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1; + + // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values + const int32x4_t coeff_0 = vld1q_s32(coeff_ptr); + const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4); + highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift, + &qcoeff_0, &qcoeff_1); + + // Store the 32-bit qcoeffs + vst1q_s32(qcoeff_ptr, qcoeff_0); + vst1q_s32(qcoeff_ptr + 4, qcoeff_1); + + // Calculate and store the dqcoeffs + dqcoeff_0 = vmulq_s32(qcoeff_0, dequant); + dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1)); + + highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr); + + return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1)); +} + +void vpx_highbd_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + // High half has identical elements, but we can reconstruct it from the low + // half by duplicating the 2nd element. So we only need to pass a 4x32-bit + // vector + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); + // Extend the quant, quant_shift vectors to ones of 32-bit elements + // scale to high-half, so we can use vqdmulhq_s32 + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); + int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s32(vget_low_s32(zbin), 1); + round = vdupq_lane_s32(vget_low_s32(round), 1); + quant = vdupq_lane_s32(vget_low_s32(quant), 1); + quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1); + dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, + round, quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 0000000000..a6684b0534 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); + + } while (++i < h); + + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c new file mode 100644 index 0000000000..af36431036 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum = vdupq_n_u32(0); + + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--h != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint16x8_t sum = vdupq_n_u16(0); + assert(h <= 16); + + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + sum = vabaq_u16(sum, s, r); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--h != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum_u32 = vdupq_n_u32(0); + + // 'h_overflow' is the number of 16-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators. + const int h_overflow = 16; + // If block height 'h' is smaller than this limit, use 'h' instead. + const int h_limit = h < h_overflow ? h : h_overflow; + assert(h % h_limit == 0); + + do { + uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + int i = h_limit; + do { + uint16x8_t s0, s1, r0, r1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + sum_u16[0] = vabaq_u16(sum_u16[0], s0, r0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (--i != 0); + + sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]); + h -= h_limit; + } while (h != 0); + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const int h_overflow) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + uint32x4_t sum_u32 = vdupq_n_u32(0); + const int h_limit = h < h_overflow ? h : h_overflow; + assert(h % h_limit == 0); + + do { + uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + sum_u16[0] = vabaq_u16(sum_u16[0], s0, r0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + sum_u16[1] = vabaq_u16(sum_u16[1], s1, r1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + sum_u16[2] = vabaq_u16(sum_u16[2], s2, r2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + sum_u16[3] = vabaq_u16(sum_u16[3], s3, r3); + + j += 32; + } while (j < w); + src16_ptr += src_stride; + ref16_ptr += ref_stride; + } while (++i != h_limit); + + sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[2]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[3]); + h -= h_limit; + } while (h != 0); + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE uint32_t highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + // 'h_overflow' is the number of 32-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 16 32-wide rows using four accumulators. + const int h_overflow = 16; + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + h_overflow); +} + +static INLINE uint32_t highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + // 'h_overflow' is the number of 64-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 8 64-wide rows using four accumulators. + const int h_overflow = 8; + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + h_overflow); +} + +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ + } + +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--h != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint16x8_t sum = vdupq_n_u16(0); + assert(h <= 16); + + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + sum = vabaq_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--h != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum_u32 = vdupq_n_u32(0); + + // 'h_overflow' is the number of 16-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 16 16-wide rows using two accumulators. + const int h_overflow = 16; + // If block height 'h' is smaller than this limit, use 'h' instead. + const int h_limit = h < h_overflow ? h : h_overflow; + assert(h % h_limit == 0); + + do { + uint16x8_t sum_u16[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + int i = h_limit; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + sum_u16[0] = vabaq_u16(sum_u16[0], s0, avg0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + sum_u16[1] = vabaq_u16(sum_u16[1], s1, avg1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]); + h -= h_limit; + } while (h != 0); + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred, + const int h_overflow) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum_u32 = vdupq_n_u32(0); + const int h_limit = h < h_overflow ? h : h_overflow; + assert(h % h_limit == 0); + + do { + uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = h_limit; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + sum_u16[0] = vabaq_u16(sum_u16[0], s0, avg0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + sum_u16[1] = vabaq_u16(sum_u16[1], s1, avg1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + sum_u16[2] = vabaq_u16(sum_u16[2], s2, avg2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + sum_u16[3] = vabaq_u16(sum_u16[3], s3, avg3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum_u32 = vpadalq_u16(sum_u32, sum_u16[0]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[1]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[2]); + sum_u32 = vpadalq_u16(sum_u32, sum_u16[3]); + h -= h_limit; + } while (h != 0); + return horizontal_add_uint32x4(sum_u32); +} + +static INLINE uint32_t highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + // 'h_overflow' is the number of 32-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 16 32-wide rows using four accumulators. + const int h_overflow = 16; + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred, h_overflow); +} + +static INLINE uint32_t highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + // 'h_overflow' is the number of 64-wide rows we can process before 16-bit + // accumulators overflow. After hitting this limit accumulate into 32-bit + // elements. 65535 / 4095 ~= 16, so 8 64-wide rows using four accumulators. + const int h_overflow = 8; + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred, h_overflow); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) + +#undef HBD_SAD_WXH_AVG_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 0000000000..91dfebf900 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. + s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 0000000000..f8b94620d4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and any height. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, blend); + + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. +static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c new file mode 100644 index 0000000000..309ae7fd35 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); +} + +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). +static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); +} + +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); +} + +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); +} + +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. + +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. +static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); +} + +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); +} + +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); +} + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c new file mode 100644 index 0000000000..1a88720172 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon_dotprod.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = \ + highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) + +#undef HIGHBD_MSE_WXH_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c new file mode 100644 index 0000000000..cebe06b099 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_sve.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse = vpx_dotq_u16(sse, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + return (uint32_t)horizontal_add_uint64x2(sse); +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_10_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 4); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint32_t sse_tmp = \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h); \ + sse_tmp = ROUND_POWER_OF_TWO(sse_tmp, 8); \ + *sse = sse_tmp; \ + return sse_tmp; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + h -= 2; + } while (h != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = vld1q_u16(src_ptr); + const uint16x8_t r = vld1q_u16(ref_ptr); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + sse_s64 = vpx_dotq_s16(sse_s64, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_add_int64x2(sse_s64); +} + +static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const uint16x8_t s0 = vld1q_u16(src_ptr); + const uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + const uint16x8_t r0 = vld1q_u16(ref_ptr); + const uint16x8_t r1 = vld1q_u16(ref_ptr + 8); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int i = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + i); + const uint16x8_t s1 = vld1q_u16(src_ptr + i + 8); + const uint16x8_t s2 = vld1q_u16(src_ptr + i + 16); + const uint16x8_t s3 = vld1q_u16(src_ptr + i + 24); + + const uint16x8_t r0 = vld1q_u16(ref_ptr + i); + const uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8); + const uint16x8_t r2 = vld1q_u16(ref_ptr + i + 16); + const uint16x8_t r3 = vld1q_u16(ref_ptr + i + 24); + + const int16x8_t diff0 = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + const int16x8_t diff1 = vreinterpretq_s16_u16(vsubq_u16(s1, r1)); + const int16x8_t diff2 = vreinterpretq_s16_u16(vsubq_u16(s2, r2)); + const int16x8_t diff3 = vreinterpretq_s16_u16(vsubq_u16(s3, r3)); + + sum_s32[0] = vpadalq_s16(sum_s32[0], diff0); + sum_s32[1] = vpadalq_s16(sum_s32[1], diff1); + sum_s32[2] = vpadalq_s16(sum_s32[2], diff2); + sum_s32[3] = vpadalq_s16(sum_s32[3], diff3); + + sse_s64[0] = vpx_dotq_s16(sse_s64[0], diff0, diff0); + sse_s64[1] = vpx_dotq_s16(sse_s64[1], diff1, diff1); + sse_s64[2] = vpx_dotq_s16(sse_s64[2], diff2, diff2); + sse_s64[3] = vpx_dotq_s16(sse_s64[3], diff3, diff3); + + i += 32; + } while (i < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[2]); + + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[2]); + + *sum = horizontal_add_int32x4(sum_s32[0]); + *sse = horizontal_add_int64x2(sse_s64[0]); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_wxh_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_SVE(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_VARIANCE_WXH_SVE(4, 4) +HBD_VARIANCE_WXH_SVE(4, 8) + +HBD_VARIANCE_WXH_SVE(8, 4) +HBD_VARIANCE_WXH_SVE(8, 8) +HBD_VARIANCE_WXH_SVE(8, 16) + +HBD_VARIANCE_WXH_SVE(16, 8) +HBD_VARIANCE_WXH_SVE(16, 16) +HBD_VARIANCE_WXH_SVE(16, 32) + +HBD_VARIANCE_WXH_SVE(32, 16) +HBD_VARIANCE_WXH_SVE(32, 32) +HBD_VARIANCE_WXH_SVE(32, 64) + +HBD_VARIANCE_WXH_SVE(64, 32) +HBD_VARIANCE_WXH_SVE(64, 64) + +#define HIGHBD_GET_VAR_SVE(s) \ + void vpx_highbd_8_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##s##x##s##var_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##s##xh_sve(src, src_stride, ref, ref_stride, s, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } + +HIGHBD_GET_VAR_SVE(8) +HIGHBD_GET_VAR_SVE(16) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 1fde13e8d6..cc6307f923 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -14,84 +14,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/highbd_convolve8_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, - int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) { - *s0 = vld1_s16(s); - s += p; - *s1 = vld1_s16(s); - s += p; - *s2 = vld1_s16(s); - s += p; - *s3 = vld1_s16(s); -} - -static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0, - uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) { - *s0 = vld1q_u16(s); - s += p; - *s1 = vld1q_u16(s); - s += p; - *s2 = vld1q_u16(s); - s += p; - *s3 = vld1q_u16(s); -} - -static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, - int16x8_t *s1, int16x8_t *s2, int16x8_t *s3, - int16x8_t *s4, int16x8_t *s5, int16x8_t *s6, - int16x8_t *s7) { - *s0 = vld1q_s16(s); - s += p; - *s1 = vld1q_s16(s); - s += p; - *s2 = vld1q_s16(s); - s += p; - *s3 = vld1q_s16(s); - s += p; - *s4 = vld1q_s16(s); - s += p; - *s5 = vld1q_s16(s); - s += p; - *s6 = vld1q_s16(s); - s += p; - *s7 = vld1q_s16(s); -} - -static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, - const uint16x8_t s1, const uint16x8_t s2, - const uint16x8_t s3, const uint16x8_t s4, - const uint16x8_t s5, const uint16x8_t s6, - const uint16x8_t s7) { - vst1q_u16(s, s0); - s += p; - vst1q_u16(s, s1); - s += p; - vst1q_u16(s, s2); - s += p; - vst1q_u16(s, s3); - s += p; - vst1q_u16(s, s4); - s += p; - vst1q_u16(s, s5); - s += p; - vst1q_u16(s, s6); - s += p; - vst1q_u16(s, s7); -} - -static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters) { +static INLINE uint16x4_t +highbd_convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, const uint16x4_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum = vdupq_n_s32(0); - sum = vmlal_lane_s16(sum, s0, filters_lo, 0); + int32x4_t sum = vmull_lane_s16(s0, filters_lo, 0); sum = vmlal_lane_s16(sum, s1, filters_lo, 1); sum = vmlal_lane_s16(sum, s2, filters_lo, 2); sum = vmlal_lane_s16(sum, s3, filters_lo, 3); @@ -99,22 +37,20 @@ static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, sum = vmlal_lane_s16(sum, s5, filters_hi, 1); sum = vmlal_lane_s16(sum, s6, filters_hi, 2); sum = vmlal_lane_s16(sum, s7, filters_hi, 3); - return sum; + + uint16x4_t res = vqrshrun_n_s32(sum, FILTER_BITS); + return vmin_u16(res, max); } -static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const uint16x8_t max) { +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum0 = vdupq_n_s32(0); - int32x4_t sum1 = vdupq_n_s32(0); - uint16x8_t d; - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0); + int32x4_t sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); @@ -122,7 +58,8 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0); + + int32x4_t sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); @@ -130,285 +67,421 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); - d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); - d = vminq_u16(d, max); - return d; + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS), + vqrshrun_n_s32(sum1, FILTER_BITS)); + return vminq_u16(res, max); } -void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused +static INLINE void highbd_convolve_4tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = + highbd_convolve4_4_neon(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x4_t d1 = + highbd_convolve4_4_neon(s1[0], s1[1], s1[2], s1[3], filter, max); + uint16x4_t d2 = + highbd_convolve4_4_neon(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x4_t d3 = + highbd_convolve4_4_neon(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = + highbd_convolve4_8_neon(s0[0], s0[1], s0[2], s0[3], filter, max); + uint16x8_t d1 = + highbd_convolve4_8_neon(s1[0], s1[1], s1[2], s1[3], filter, max); + uint16x8_t d2 = + highbd_convolve4_8_neon(s2[0], s2[1], s2[2], s2[3], filter, max); + uint16x8_t d3 = + highbd_convolve4_8_neon(s3[0], s3[1], s3[2], s3[3], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filter, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filter, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filter, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const int16x8_t filters = vld1q_s16(filter_x); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; - - do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); - - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - transpose_u16_4x4q(&d01, &d23); - - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - transpose_u16_8x4(&d0, &d1, &d2, &d3); - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - int16x8_t s11, s12, s13, s14; - uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); } } -void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[8], s1[8], s2[8], s3[8]; + load_s16_4x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x4_t d0 = highbd_convolve8_4(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3; - - if (h == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23, t01, t23; - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - s0 = vreinterpret_s16_u16(vget_low_u16(t0)); - s1 = vreinterpret_s16_u16(vget_low_u16(t1)); - s2 = vreinterpret_s16_u16(vget_low_u16(t2)); - s3 = vreinterpret_s16_u16(vget_low_u16(t3)); - s4 = vreinterpret_s16_u16(vget_high_u16(t0)); - s5 = vreinterpret_s16_u16(vget_high_u16(t1)); - s6 = vreinterpret_s16_u16(vget_high_u16(t2)); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - src += 7; + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; do { - load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); - transpose_s16_4x4d(&s7, &s8, &s9, &s10); + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint16x8_t d0 = highbd_convolve8_8(s0[0], s0[1], s0[2], s0[3], s0[4], + s0[5], s0[6], s0[7], filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1[0], s1[1], s1[2], s1[3], s1[4], + s1[5], s1[6], s1[7], filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2[0], s2[1], s2[2], s2[3], s2[4], + s2[5], s2[6], s2[7], filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3[0], s3[1], s3[2], s3[3], s3[4], + s3[5], s3[6], s3[7], filters, max); - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); - transpose_u16_4x4q(&t01, &t23); + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 2 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); - vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); - vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); - vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); - vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_4tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x4_t d0 = highbd_convolve4_4_neon(s0, s1, s2, s3, filter, max); + uint16x4_t d1 = highbd_convolve4_4_neon(s1, s2, s3, s4, filter, max); + uint16x4_t d2 = highbd_convolve4_4_neon(s2, s3, s4, s5, filter, max); + uint16x4_t d3 = highbd_convolve4_4_neon(s3, s4, s5, s6, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint16x8_t d0 = highbd_convolve4_8_neon(s0, s1, s2, s3, filter, max); + uint16x8_t d1 = highbd_convolve4_8_neon(s1, s2, s3, s4, filter, max); + uint16x8_t d2 = highbd_convolve4_8_neon(s2, s3, s4, s5, filter, max); + uint16x8_t d3 = highbd_convolve4_8_neon(s3, s4, s5, s6, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_convolve_8tap_vert_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -417,410 +490,140 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, s4 = s8; s5 = s9; s6 = s10; - src += 4; - dst += 4; - w -= 4; - } while (w > 0); - } else { - int16x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; - - if (w == 4) { - do { - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, - &t4, &t5, &t6, &t7); - src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - transpose_u16_8x4(&t0, &t1, &t2, &t3); - - d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 4 * dst_stride)); - d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), - vld1_u16(dst + 5 * dst_stride)); - d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 6 * dst_stride)); - d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), - vld1_u16(dst + 7 * dst_stride)); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1_u16(dst, vget_low_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d3)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d0)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d1)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d2)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d3)); - dst += dst_stride; - h -= 8; - } while (h > 0); - } else { - int width; - const uint16_t *s; - uint16_t *d; - int16x8_t s11, s12, s13, s14; - uint16x8_t d4, d5, d6, d7; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, - &s5, &s6, &s7); - transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - - do { - load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, - &s12, &s13, &s14); - transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); - - transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - - d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); - d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); - d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); - d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); - d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); - d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); - d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); - d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); - - store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); - - s0 = s8; - s1 = s9; - s2 = s10; - s3 = s11; - s4 = s12; - s5 = s13; - s6 = s14; - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 8 * src_stride; - dst += 8 * dst_stride; - h -= 8; - } while (h > 0); - } - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } -void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + const int16x4_t y_filter_4tap = vld1_s16(filter[y0_q4] + 2); + highbd_convolve_4tap_vert_neon(src - src_stride, src_stride, dst, + dst_stride, w, h, y_filter_4tap, bd); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const int16x8_t filters = vld1q_s16(filter_y); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - - src -= 3 * src_stride; - - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - d01 = vminq_u16(d01, max); - d23 = vminq_u16(d23, max); - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - vst1q_u16(d, d0); - d += dst_stride; - vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); } } -void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h, bd); + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + + s += 7 * src_stride; + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint16x4_t d0 = + highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x4_t d1 = + highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x4_t d2 = + highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x4_t d3 = + highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); } else { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; - src -= 3 * src_stride; + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - if (w == 4) { - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int32x4_t d0, d1, d2, d3; - uint16x8_t d01, d23, t01, t23; - - s0 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s1 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s2 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s3 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s4 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s5 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s6 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + s += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s8 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s9 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; - s10 = vreinterpret_s16_u16(vld1_u16(src)); - src += src_stride; + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint16x8_t d0 = + highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + uint16x8_t d1 = + highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + uint16x8_t d2 = + highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + uint16x8_t d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); - t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); - t01 = vminq_u16(t01, max); - t23 = vminq_u16(t23, max); + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); - d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), - vld1_u16(dst + 1 * dst_stride)); - d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), - vld1_u16(dst + 3 * dst_stride)); - d01 = vrhaddq_u16(d01, t01); - d23 = vrhaddq_u16(d23, t23); - - vst1_u16(dst, vget_low_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d01)); - dst += dst_stride; - vst1_u16(dst, vget_low_u16(d23)); - dst += dst_stride; - vst1_u16(dst, vget_high_u16(d23)); - dst += dst_stride; + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -829,95 +632,600 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, s4 = s8; s5 = s9; s6 = s10; - h -= 4; - } while (h > 0); - } else { - int height; - const uint16_t *s; - uint16_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s1 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s2 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s3 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s4 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s5 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s6 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s8 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s9 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - s10 = vreinterpretq_s16_u16(vld1q_u16(s)); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - - d0 = vld1q_u16(d + 0 * dst_stride); - d1 = vld1q_u16(d + 1 * dst_stride); - d2 = vld1q_u16(d + 2 * dst_stride); - d3 = vld1q_u16(d + 3 * dst_stride); - d0 = vrhaddq_u16(d0, t0); - d1 = vrhaddq_u16(d1, t1); - d2 = vrhaddq_u16(d2, t2); - d3 = vrhaddq_u16(d3, t3); - - vst1q_u16(d, d0); - d += dst_stride; - vst1q_u16(d, d1); - d += dst_stride; - vst1q_u16(d, d2); - d += dst_stride; - vst1q_u16(d, d3); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); } } + +static INLINE void highbd_convolve_2d_4tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filter, + const int16x4_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x4_t v_s0 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x4_t v_s3 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16(highbd_convolve4_4_neon( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x4_t d0 = + highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x4_t d1 = + highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x4_t d2 = + highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x4_t d3 = + highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x8_t v_s0 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s0[0], h_s0[1], h_s0[2], h_s0[3], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s1[0], h_s1[1], h_s1[2], h_s1[3], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s2[0], h_s2[1], h_s2[2], h_s2[3], x_filter, max)); + + s += 3 * src_stride; + + do { + int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x8_t v_s3 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s3[0], h_s3[1], h_s3[2], h_s3[3], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s4[0], h_s4[1], h_s4[2], h_s4[3], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s5[0], h_s5[1], h_s5[2], h_s5[3], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16(highbd_convolve4_8_neon( + h_s6[0], h_s6[1], h_s6[2], h_s6[3], x_filter, max)); + + uint16x8_t d0 = + highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filter, max); + uint16x8_t d1 = + highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filter, max); + uint16x8_t d2 = + highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filter, max); + uint16x8_t d3 = + highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +static INLINE void highbd_convolve_2d_8tap_neon( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t x_filter, + const int16x8_t y_filter, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} + +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2); + + highbd_convolve_2d_4tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); + return; + } + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + highbd_convolve_2d_8tap_neon(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + // Averaging convolution always uses an 8-tap filter. + const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; + const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; + // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 + // lines post both horizontally and vertically. + src = src - horiz_offset - vert_offset; + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_4x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_4x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_4x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x4_t v_s0 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x4_t v_s1 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x4_t v_s2 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x4_t v_s3 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x4_t v_s4 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x4_t v_s5 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x4_t v_s6 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x4_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_4x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_4x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_4x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_4x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x4_t v_s7 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x4_t v_s8 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x4_t v_s9 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x4_t v_s10 = vreinterpret_s16_u16( + highbd_convolve8_4(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x4_t d0 = highbd_convolve8_4(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + return; + } + + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[8], h_s1[8], h_s2[8], h_s3[8], h_s4[8], h_s5[8], h_s6[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3], + &h_s0[4], &h_s0[5], &h_s0[6], &h_s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3], + &h_s1[4], &h_s1[5], &h_s1[6], &h_s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3], + &h_s2[4], &h_s2[5], &h_s2[6], &h_s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], &h_s3[3], + &h_s3[4], &h_s3[5], &h_s3[6], &h_s3[7]); + load_s16_8x8(s + 4 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], &h_s4[3], + &h_s4[4], &h_s4[5], &h_s4[6], &h_s4[7]); + load_s16_8x8(s + 5 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], &h_s5[3], + &h_s5[4], &h_s5[5], &h_s5[6], &h_s5[7]); + load_s16_8x8(s + 6 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], &h_s6[3], + &h_s6[4], &h_s6[5], &h_s6[6], &h_s6[7]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s0[0], h_s0[1], h_s0[2], h_s0[3], h_s0[4], h_s0[5], + h_s0[6], h_s0[7], x_filter, max)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s1[0], h_s1[1], h_s1[2], h_s1[3], h_s1[4], h_s1[5], + h_s1[6], h_s1[7], x_filter, max)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s2[0], h_s2[1], h_s2[2], h_s2[3], h_s2[4], h_s2[5], + h_s2[6], h_s2[7], x_filter, max)); + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s3[0], h_s3[1], h_s3[2], h_s3[3], h_s3[4], h_s3[5], + h_s3[6], h_s3[7], x_filter, max)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s4[0], h_s4[1], h_s4[2], h_s4[3], h_s4[4], h_s4[5], + h_s4[6], h_s4[7], x_filter, max)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s5[0], h_s5[1], h_s5[2], h_s5[3], h_s5[4], h_s5[5], + h_s5[6], h_s5[7], x_filter, max)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s6[0], h_s6[1], h_s6[2], h_s6[3], h_s6[4], h_s6[5], + h_s6[6], h_s6[7], x_filter, max)); + + s += 7 * src_stride; + + do { + int16x8_t h_s7[8], h_s8[8], h_s9[8], h_s10[8]; + load_s16_8x8(s + 0 * src_stride, 1, &h_s7[0], &h_s7[1], &h_s7[2], + &h_s7[3], &h_s7[4], &h_s7[5], &h_s7[6], &h_s7[7]); + load_s16_8x8(s + 1 * src_stride, 1, &h_s8[0], &h_s8[1], &h_s8[2], + &h_s8[3], &h_s8[4], &h_s8[5], &h_s8[6], &h_s8[7]); + load_s16_8x8(s + 2 * src_stride, 1, &h_s9[0], &h_s9[1], &h_s9[2], + &h_s9[3], &h_s9[4], &h_s9[5], &h_s9[6], &h_s9[7]); + load_s16_8x8(s + 3 * src_stride, 1, &h_s10[0], &h_s10[1], &h_s10[2], + &h_s10[3], &h_s10[4], &h_s10[5], &h_s10[6], &h_s10[7]); + + int16x8_t v_s7 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s7[0], h_s7[1], h_s7[2], h_s7[3], h_s7[4], + h_s7[5], h_s7[6], h_s7[7], x_filter, max)); + int16x8_t v_s8 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s8[0], h_s8[1], h_s8[2], h_s8[3], h_s8[4], + h_s8[5], h_s8[6], h_s8[7], x_filter, max)); + int16x8_t v_s9 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s9[0], h_s9[1], h_s9[2], h_s9[3], h_s9[4], + h_s9[5], h_s9[6], h_s9[7], x_filter, max)); + int16x8_t v_s10 = vreinterpretq_s16_u16( + highbd_convolve8_8(h_s10[0], h_s10[1], h_s10[2], h_s10[3], h_s10[4], + h_s10[5], h_s10[6], h_s10[7], x_filter, max)); + + uint16x8_t d0 = highbd_convolve8_8(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, + v_s6, v_s7, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, + v_s7, v_s8, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7, + v_s8, v_s9, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8, + v_s9, v_s10, y_filter, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + v_s3 = v_s7; + v_s4 = v_s8; + v_s5 = v_s9; + v_s6 = v_s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c new file mode 100644 index 0000000000..f909e06a18 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/highbd_convolve8_sve.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6, + 1, 3, 5, 7 }; + +static INLINE void highbd_convolve_4tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t filters, int bd) { + const int16x8_t filter = vcombine_s16(filters, vdup_n_s16(0)); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x4_t s0[4], s1[4], s2[4], s3[4]; + load_s16_4x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_4x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve4_4_sve(s0, filter, max); + uint16x4_t d1 = highbd_convolve4_4_sve(s1, filter, max); + uint16x4_t d2 = highbd_convolve4_4_sve(s2, filter, max); + uint16x4_t d3 = highbd_convolve4_4_sve(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x8_t idx = vld1q_u16(kTblConv4_8); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = highbd_convolve4_8_sve(s0, filter, max, idx); + uint16x8_t d1 = highbd_convolve4_8_sve(s1, filter, max, idx); + uint16x8_t d2 = highbd_convolve4_8_sve(s2, filter, max, idx); + uint16x8_t d3 = highbd_convolve4_8_sve(s3, filter, max, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void highbd_convolve_8tap_horiz_sve( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filters, int bd) { + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_highbd_convolve8_horiz_sve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + const int16x4_t x_filter_4tap = vld1_s16(filter[x0_q4] + 2); + highbd_convolve_4tap_horiz_sve(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap, bd); + } else { + const int16x8_t x_filter_8tap = vld1q_s16(filter[x0_q4]); + highbd_convolve_8tap_horiz_sve(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_horiz_sve(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c new file mode 100644 index 0000000000..8408f98f4a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_sve2.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/highbd_convolve8_neon.h" +#include "vpx_dsp/arm/highbd_convolve8_sve.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" +#include "vpx_dsp/arm/vpx_neon_sve2_bridge.h" + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +DECLARE_ALIGNED(16, static const uint16_t, kTblConv4_8[8]) = { 0, 2, 4, 6, + 1, 3, 5, 7 }; + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t s_lo[2], + int16x8_t s_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(const int16x8_t s_lo[4], + const int16x8_t s_hi[4], + const int16x8_t filter, + const uint16x8_t max) { + int64x2_t sum01 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[0], filter, 0); + sum01 = vpx_dotq_lane_s16(sum01, s_hi[0], filter, 1); + + int64x2_t sum23 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[1], filter, 0); + sum23 = vpx_dotq_lane_s16(sum23, s_hi[1], filter, 1); + + int64x2_t sum45 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[2], filter, 0); + sum45 = vpx_dotq_lane_s16(sum45, s_hi[2], filter, 1); + + int64x2_t sum67 = vpx_dotq_lane_s16(vdupq_n_s64(0), s_lo[3], filter, 0); + sum67 = vpx_dotq_lane_s16(sum67, s_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve8_8tap_vert_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x8_t filter, int bd) { + assert(w >= 4 && h >= 4); + + do { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_s16_4x4(s4, s5, s6, s7, &s4567[0], &s4567[1]); + transpose_concat_s16_4x4(s5, s6, s7, s8, &s5678[0], &s5678[1]); + transpose_concat_s16_4x4(s6, s7, s8, s9, &s6789[0], &s6789[1]); + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s4 = s8; + s5 = s9; + s6 = sA; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + + src += 4; + dst += 4; + w -= 4; + } while (w != 0); +} + +void vpx_highbd_convolve8_vert_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + vpx_highbd_convolve8_vert_neon(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + } else { + const int16x8_t y_filter_8tap = vld1q_s16(filter[y0_q4]); + highbd_convolve8_8tap_vert_sve2(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_8tap, bd); + } +} + +void vpx_highbd_convolve8_avg_vert_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); + return; + } + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + + const int16x8_t filters = vld1q_s16(filter[y0_q4]); + + src -= 3 * src_stride; + + uint16x8x3_t merge_tbl_idx = vld1q_u16_x3(kDotProdMergeBlockTbl); + + // Correct indices by the size of vector length. + merge_tbl_idx.val[0] = vaddq_u16( + merge_tbl_idx.val[0], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL))); + merge_tbl_idx.val[1] = vaddq_u16( + merge_tbl_idx.val[1], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL))); + merge_tbl_idx.val[2] = vaddq_u16( + merge_tbl_idx.val[2], + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL))); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_s16_4x4(s0, s1, s2, s3, &s0123[0], &s0123[1]); + transpose_concat_s16_4x4(s1, s2, s3, s4, &s1234[0], &s1234[1]); + transpose_concat_s16_4x4(s2, s3, s4, s5, &s2345[0], &s2345[1]); + transpose_concat_s16_4x4(s3, s4, s5, s6, &s3456[0], &s3456[1]); + + do { + int16x4_t s7, s8, s9, sA; + + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + transpose_concat_s16_4x4(s7, s8, s9, sA, &s789A[0], &s789A[1]); + + vpx_tbl2x2_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x2_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x2_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, filters, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, filters, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, filters, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s789A, filters, max); + + d0 = vrhadd_u16(d0, vld1_u16(d + 0 * dst_stride)); + d1 = vrhadd_u16(d1, vld1_u16(d + 1 * dst_stride)); + d2 = vrhadd_u16(d2, vld1_u16(d + 2 * dst_stride)); + d3 = vrhadd_u16(d3, vld1_u16(d + 3 * dst_stride)); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_s16_8x4(s0, s1, s2, s3, &s0123[0], &s0123[1], &s0123[2], + &s0123[3]); + transpose_concat_s16_8x4(s1, s2, s3, s4, &s1234[0], &s1234[1], &s1234[2], + &s1234[3]); + transpose_concat_s16_8x4(s2, s3, s4, s5, &s2345[0], &s2345[1], &s2345[2], + &s2345[3]); + transpose_concat_s16_8x4(s3, s4, s5, s6, &s3456[0], &s3456[1], &s3456[2], + &s3456[3]); + + do { + int16x8_t s7, s8, s9, sA; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &sA); + + int16x8_t s4567[4], s5678[5], s6789[4], s789A[4]; + transpose_concat_s16_8x4(s7, s8, s9, sA, &s789A[0], &s789A[1], + &s789A[2], &s789A[3]); + + vpx_tbl2x4_s16(s3456, s789A, s4567, merge_tbl_idx.val[0]); + vpx_tbl2x4_s16(s3456, s789A, s5678, merge_tbl_idx.val[1]); + vpx_tbl2x4_s16(s3456, s789A, s6789, merge_tbl_idx.val[2]); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, filters, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, filters, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, filters, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s789A, filters, max); + + d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_convolve_2d_4tap_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int w, int h, const int16x4_t x_filters, + const int16x4_t y_filters, int bd) { + const int16x8_t x_filter = vcombine_s16(x_filters, vdup_n_s16(0)); + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + int16x4_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], &h_s0[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], &h_s1[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], &h_s2[3]); + + int16x4_t v_s0 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s0, x_filter, max)); + int16x4_t v_s1 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s1, x_filter, max)); + int16x4_t v_s2 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s2, x_filter, max)); + + s += 3 * src_stride; + + do { + int16x4_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_4x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_4x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_4x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_4x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x4_t v_s3 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s3, x_filter, max)); + int16x4_t v_s4 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s4, x_filter, max)); + int16x4_t v_s5 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s5, x_filter, max)); + int16x4_t v_s6 = + vreinterpret_s16_u16(highbd_convolve4_4_sve(h_s6, x_filter, max)); + + uint16x4_t d0 = + highbd_convolve4_4_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max); + uint16x4_t d1 = + highbd_convolve4_4_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max); + uint16x4_t d2 = + highbd_convolve4_4_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max); + uint16x4_t d3 = + highbd_convolve4_4_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const uint16x8_t idx = vld1q_u16(kTblConv4_8); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int height = h; + + int16x8_t h_s0[4], h_s1[4], h_s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s0[0], &h_s0[1], &h_s0[2], + &h_s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s1[0], &h_s1[1], &h_s1[2], + &h_s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s2[0], &h_s2[1], &h_s2[2], + &h_s2[3]); + + int16x8_t v_s0 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s0, x_filter, max, idx)); + int16x8_t v_s1 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s1, x_filter, max, idx)); + int16x8_t v_s2 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s2, x_filter, max, idx)); + + s += 3 * src_stride; + + do { + int16x8_t h_s3[4], h_s4[4], h_s5[4], h_s6[4]; + load_s16_8x4(s + 0 * src_stride, 1, &h_s3[0], &h_s3[1], &h_s3[2], + &h_s3[3]); + load_s16_8x4(s + 1 * src_stride, 1, &h_s4[0], &h_s4[1], &h_s4[2], + &h_s4[3]); + load_s16_8x4(s + 2 * src_stride, 1, &h_s5[0], &h_s5[1], &h_s5[2], + &h_s5[3]); + load_s16_8x4(s + 3 * src_stride, 1, &h_s6[0], &h_s6[1], &h_s6[2], + &h_s6[3]); + + int16x8_t v_s3 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s3, x_filter, max, idx)); + int16x8_t v_s4 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s4, x_filter, max, idx)); + int16x8_t v_s5 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s5, x_filter, max, idx)); + int16x8_t v_s6 = vreinterpretq_s16_u16( + highbd_convolve4_8_sve(h_s6, x_filter, max, idx)); + + uint16x8_t d0 = + highbd_convolve4_8_neon(v_s0, v_s1, v_s2, v_s3, y_filters, max); + uint16x8_t d1 = + highbd_convolve4_8_neon(v_s1, v_s2, v_s3, v_s4, y_filters, max); + uint16x8_t d2 = + highbd_convolve4_8_neon(v_s2, v_s3, v_s4, v_s5, y_filters, max); + uint16x8_t d3 = + highbd_convolve4_8_neon(v_s3, v_s4, v_s5, v_s6, y_filters, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void highbd_convolve8_2d_horiz_sve2( + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h, int bd) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3 && h >= 7); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + const int16x8_t filters = vld1q_s16(filter[x0_q4]); + + src -= 3; + + if (w == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + uint16x4_t d3 = highbd_convolve8_4(s3, filters, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 3); + + // Process final three rows (h % 4 == 3). + int16x8_t s0[4], s1[4], s2[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + + uint16x4_t d0 = highbd_convolve8_4(s0, filters, max); + uint16x4_t d1 = highbd_convolve8_4(s1, filters, max); + uint16x4_t d2 = highbd_convolve8_4(s2, filters, max); + + store_u16_4x3(d, dst_stride, d0, d1, d2); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + uint16x8_t d3 = highbd_convolve8_8(s3, filters, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 3); + + // Process final three rows (h % 4 == 3). + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int width = w; + + do { + int16x8_t s0[8], s1[8], s2[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + + uint16x8_t d0 = highbd_convolve8_8(s0, filters, max); + uint16x8_t d1 = highbd_convolve8_8(s1, filters, max); + uint16x8_t d2 = highbd_convolve8_8(s2, filters, max); + + store_u16_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_highbd_convolve8_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int horiz_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + + if (horiz_filter_taps == 4 || vert_filter_taps == 4) { + const ptrdiff_t horiz_offset = horiz_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (vert_filter_taps / 2 - 1) * src_stride; + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x4_t y_filter = vld1_s16(filter[y0_q4] + 2); + + highbd_convolve_2d_4tap_sve2(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter, y_filter, bd); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). + DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]); + const int im_stride = 64; + + // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior + // and SUBPEL_TAPS / 2 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1; + + highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, im_height, bd); + + // Step into the temporary buffer border_offset rows to get actual frame data. + vpx_highbd_convolve8_vert_sve2(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + if (x_step_q4 != 16 || y_step_q4 != 16) { + vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); + return; + } + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). + DECLARE_ALIGNED(32, uint16_t, im_block[64 * 71]); + const int im_stride = 64; + + // Account for the vertical phase needing SUBPEL_TAPS / 2 - 1 lines prior + // and SUBPEL_TAPS / 2 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1; + + highbd_convolve8_2d_horiz_sve2(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, im_height, bd); + + // Step into the temporary buffer border_offset rows to get actual frame data. + vpx_highbd_convolve8_avg_vert_sve2(im_block + im_stride * border_offset, + im_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index f4d70761eb..765a054f8d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -13,18 +13,16 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // avg4 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index a980ab1a38..7751082083 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -13,91 +13,101 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; + s1 = vld1_u16(src); + src += src_stride; + + vst1_u16(dst, s0); dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); - src += src_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; + s1 = vld1q_u16(src); + src += src_stride; + + vst1q_u16(dst, s0); dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); - src += src_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c deleted file mode 100644 index 4e6e109920..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/vpx_filter.h" -#include "vpx_ports/mem.h" - -void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); - // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal. */ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, bd); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); -} - -void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); - // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), - src_stride, CONVERT_TO_BYTEPTR(temp), w, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm deleted file mode 100644 index d648840df4..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ /dev/null @@ -1,196 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vpx_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct16x16_1_add_neon| - - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 968bc5cc3a..bf5192a683 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -32,7 +32,8 @@ static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm deleted file mode 100644 index ea6b099d3b..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ /dev/null @@ -1,1176 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - INCLUDE vpx_dsp/arm/idct_neon.asm.S - - EXPORT |vpx_idct16x16_256_add_neon_pass1| - EXPORT |vpx_idct16x16_256_add_neon_pass2| - IF CONFIG_VP9_HIGHBITDEPTH - EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low| - EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low| - ENDIF - EXPORT |vpx_idct16x16_10_add_neon_pass1| - EXPORT |vpx_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output) -; -; r0 const int16_t *input -; r1 int16_t *output - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - -idct16x16_256_add_neon_pass1 - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r12, #0x3ec5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; cospi_12_64 = 13623 - movw r3, #0x3537 - - ; cospi_20_64 = 9102 - movw r12, #0x238e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d8, q2, #14 ; >> 14 - vrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d14, q5, #14 ; >> 14 - vrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - ; cospi_24_64 = 6270 - movw r12, #0x187e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q2, #14 ; >> 14 - vrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; cospi_8_64 = 15137 - movw r3, #0x3b21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d16, q3, #14 ; >> 14 - vrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d18, q13, #14 ; >> 14 - vrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d22, q0, #14 ; >> 14 - vrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d20, q12, #14 ; >> 14 - vrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q6, #14 ; >> 14 - vrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {q8-q9}, [r1]! - vst1.64 {q10-q11}, [r1]! - vst1.64 {q12-q13}, [r1]! - vst1.64 {q14-q15}, [r1] - - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass1| - - IF CONFIG_VP9_HIGHBITDEPTH -;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input, -; int16_t *output) -; -; r0 const tran_low_t *input -; r1 int16_t *output - -|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 - vmov.s16 q15, q1 - - b idct16x16_256_add_neon_pass1 - ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low| - ENDIF ; CONFIG_VP9_HIGHBITDEPTH - -;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, -; int16_t *output, -; int16_t *pass1_output, -; int16_t skip_adding, -; uint8_t *dest, -; int stride) -; -; r0 const int16_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output -; r3 int16_t skip_adding -; r4 uint8_t *dest -; r5 int stride - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass2| PROC - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - -idct16x16_256_add_neon_pass2 - push {r3-r9} - - ; cospi_30_64 = 1606 - movw r3, #0x0646 - - ; cospi_2_64 = 16305 - movw r12, #0x3fb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; cospi_14_64 = 12665 - movw r3, #0x3179 - - ; cospi_18_64 = 10394 - movw r12, #0x289a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d0, q2, #14 ; >> 14 - vrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d14, q1, #14 ; >> 14 - vrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; cospi_22_64 = 7723 - movw r3, #0x1e2b - - ; cospi_10_64 = 14449 - movw r12, #0x3871 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q2, #14 ; >> 14 - vrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q4, #14 ; >> 14 - vrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; cospi_6_64 = 15679 - movw r3, #0x3d3f - - ; cospi_26_64 = 4756 - movw r12, #0x1294 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q11, #14 ; >> 14 - vrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d11, q5, #14 ; >> 14 - vrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d8, q12, #14 ; >> 14 - vrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; cospi_24_64 = 6270 - movw r3, #0x187e - - ; cospi_8_64 = 15137 - movw r12, #0x3b21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q2, #14 ; >> 14 - vrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q4, #14 ; >> 14 - vrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d4, q11, #14 ; >> 14 - vrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q8, #14 ; >> 14 - vrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q5, #14 ; >> 14 - vrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q10, #14 ; >> 14 - vrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d8, q13, #14 ; >> 14 - vrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1_output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass2| - - IF CONFIG_VP9_HIGHBITDEPTH -;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, -; int16_t *output, -; int16_t *pass1_output, -; int16_t skip_adding, -; uint8_t *dest, -; int stride) -; -; r0 const tran_low_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output -; r3 int16_t skip_adding -; r4 uint8_t *dest -; r5 int stride - -|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 - vmov.s16 q15, q0 - - b idct16x16_256_add_neon_pass2 - ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low| - ENDIF ; CONFIG_VP9_HIGHBITDEPTH - -;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, -; int16_t *output) -; -; r0 const tran_low_t *input -; r1 int16_t *output - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 - vmov.s16 q15, q1 - - ; cospi_28_64*2 = 6392 - movw r3, #0x18f8 - - ; cospi_4_64*2 = 32138 - movw r12, #0x7d8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; cospi_16_64*2 = 23170 - movw r3, #0x5a82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d11, q15, #14 ; >> 14 - vrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {q2}, [r1]! - vst1.64 {q9-q10}, [r1]! - vst1.64 {q11-q12}, [r1]! - vst1.64 {q13-q14}, [r1]! - vst1.64 {q15}, [r1] - - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass1| - -;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, -; int16_t *pass1_output) -; -; r0 const tran_low_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 - vmov.s16 q15, q0; - - ; 2*cospi_30_64 = 3212 - movw r3, #0x0c8c - - ; 2*cospi_2_64 = 32610 - movw r12, #0x7f62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; 2*cospi_26_64 = 9512 - movw r12, #0x2528 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; 2*cospi_6_64 = 31358 - movw r3, #0x7a7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; cospi_24_64 = 6270 - movw r3, #0x187e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; cospi_8_64 = 15137 - movw r12, #0x3b21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q12, #14 ; >> 14 - vrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q2, #14 ; >> 14 - vrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q10, #14 ; >> 14 - vrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q8, #14 ; >> 14 - vrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q5, #14 ; >> 14 - vrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q0, #14 ; >> 14 - vrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vrshrn.s32 d8, q13, #14 ; >> 14 - vrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1_output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass2| - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index f4eb24615e..fc7f4a7747 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,1255 +10,755 @@ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static void idct16x16_256_add_neon_pass1(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - int16_t *out) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; +static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS); +} - q8s16 = s0; - q9s16 = s1; - q10s16 = s2; - q11s16 = s3; - q12s16 = s4; - q13s16 = s5; - q14s16 = s6; - q15s16 = s7; +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; + + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11], + &step2[12]); // stage 3 - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q5s32, 14); - d15s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vrshrn_n_s32(q2s32, 14); - d11s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16((int16_t)cospi_24_64); - d31s16 = vdup_n_s16((int16_t)cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vrshrn_n_s32(q3s32, 14); - d17s16 = vrshrn_n_s32(q12s32, 14); - d18s16 = vrshrn_n_s32(q13s32, 14); - d19s16 = vrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vrshrn_n_s32(q0s32, 14); - d23s16 = vrshrn_n_s32(q1s32, 14); - d20s16 = vrshrn_n_s32(q12s32, 14); - d21s16 = vrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vrshrn_n_s32(q6s32, 14); - d11s16 = vrshrn_n_s32(q13s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - // store the data - vst1q_s16(out, q8s16); - out += 8; - vst1q_s16(out, q9s16); - out += 8; - vst1q_s16(out, q10s16); - out += 8; - vst1q_s16(out, q11s16); - out += 8; - vst1q_s16(out, q12s16); - out += 8; - vst1q_s16(out, q13s16); - out += 8; - vst1q_s16(out, q14s16); - out += 8; - vst1q_s16(out, q15s16); -} - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - int16x8x2_t v; - - v = vld2q_s16(in); - s0 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s1 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s2 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s3 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s4 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s5 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s6 = v.val[0]; - in += 16; - v = vld2q_s16(in); - s7 = v.val[0]; - - idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *in, - int16_t *out) { - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; - int16x8x2_t v; - - v = load_tran_low_to_s16x2q(in); - s0 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s1 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s2 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s3 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s4 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s5 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s6 = v.val[0]; - in += 16; - v = load_tran_low_to_s16x2q(in); - s7 = v.val[0]; - - idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -static void idct16x16_256_add_neon_pass2(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - int16_t *out, int16_t *pass1_output, - int16_t skip_adding, uint8_t *dest, - int stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - - q8s16 = s0; - q9s16 = s1; - q10s16 = s2; - q11s16 = s3; - q12s16 = s4; - q13s16 = s5; - q14s16 = s6; - q15s16 = s7; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16((int16_t)cospi_30_64); - d13s16 = vdup_n_s16((int16_t)cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vrshrn_n_s32(q2s32, 14); - d1s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q1s32, 14); - d15s16 = vrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_14_64); - d31s16 = vdup_n_s16((int16_t)cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vrshrn_n_s32(q2s32, 14); - d3s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q4s32, 14); - d13s16 = vrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16((int16_t)cospi_22_64); - d31s16 = vdup_n_s16((int16_t)cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vrshrn_n_s32(q11s32, 14); - d5s16 = vrshrn_n_s32(q12s32, 14); - d11s16 = vrshrn_n_s32(q5s32, 14); - d10s16 = vrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16((int16_t)cospi_6_64); - d31s16 = vdup_n_s16((int16_t)cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vrshrn_n_s32(q10s32, 14); - d7s16 = vrshrn_n_s32(q11s32, 14); - d8s16 = vrshrn_n_s32(q12s32, 14); - d9s16 = vrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vrshrn_n_s32(q2s32, 14); - d13s16 = vrshrn_n_s32(q3s32, 14); - d2s16 = vrshrn_n_s32(q4s32, 14); - d3s16 = vrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vrshrn_n_s32(q11s32, 14); - d5s16 = vrshrn_n_s32(q12s32, 14); - d10s16 = vrshrn_n_s32(q8s32, 14); - d11s16 = vrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vrshrn_n_s32(q5s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - d10s16 = vrshrn_n_s32(q10s32, 14); - d11s16 = vrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vrshrn_n_s32(q10s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - d8s16 = vrshrn_n_s32(q13s32, 14); - d9s16 = vrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += stride; + idct16x16_add_stage7(step2, out); - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - pass1_output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } } } -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out, - int16_t *pass1_output, - int16_t skip_adding, uint8_t *dest, - int stride) { - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int16x8x2_t q0x2s16; +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x8_t in[8], step1[16], step2[16], out[16]; - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; + // Load input (8x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[7] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 16; + in[1] = vld1q_s16(inputT); + inputT += 16; + in[2] = vld1q_s16(inputT); + inputT += 16; + in[3] = vld1q_s16(inputT); + inputT += 16; + in[4] = vld1q_s16(inputT); + inputT += 16; + in[5] = vld1q_s16(inputT); + inputT += 16; + in[6] = vld1q_s16(inputT); + inputT += 16; + in[7] = vld1q_s16(inputT); + } - idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16, out, pass1_output, skip_adding, - dest, stride); -} + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, - int16_t *out, - int16_t *pass1_output, - int16_t skip_adding, - uint8_t *dest, int stride) { - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int16x8x2_t q0x2s16; + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 - q0x2s16 = load_tran_low_to_s16x2q(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q15s16 = q0x2s16.val[0]; - - idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16, out, pass1_output, skip_adding, - dest, stride); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = load_tran_low_to_s16x2q(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = load_tran_low_to_s16x2q(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3); + step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2); + step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); // stage 3 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2); + step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - d4s16 = vdup_n_s16((int16_t)cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q6s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - // store the data - vst1q_s16(out, q2s16); - out += 8; - vst1q_s16(out, q9s16); - out += 8; - vst1q_s16(out, q10s16); - out += 8; - vst1q_s16(out, q11s16); - out += 8; - vst1q_s16(out, q12s16); - out += 8; - vst1q_s16(out, q13s16); - out += 8; - vst1q_s16(out, q14s16); - out += 8; - vst1q_s16(out, q15s16); -} - -void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *out, - int16_t *pass1_output) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = load_tran_low_to_s16x2q(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = load_tran_low_to_s16x2q(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2); - q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vrshrn_n_s32(q12s32, 14); - d3s16 = vrshrn_n_s32(q5s32, 14); - d12s16 = vrshrn_n_s32(q2s32, 14); - d13s16 = vrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vrshrn_n_s32(q10s32, 14); - d5s16 = vrshrn_n_s32(q13s32, 14); - d10s16 = vrshrn_n_s32(q8s32, 14); - d11s16 = vrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3); + step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vrshrn_n_s32(q5s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - d10s16 = vrshrn_n_s32(q0s32, 14); - d11s16 = vrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vrshrn_n_s32(q10s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - d8s16 = vrshrn_n_s32(q13s32, 14); - d9s16 = vrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); + idct16x16_add_stage7(step2, out); - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q1s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1_output); - pass1_output += 8; - q11s16 = vld1q_s16(pass1_output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] = load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); +} + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0); + + // Parallel idct on the lower 8 rows + vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 0); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 0); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c deleted file mode 100644 index 47366bcb7d..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1_output, - int16_t skip_adding, uint8_t *dest, - int stride); -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input, - int16_t *output); -void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, - int16_t *output, - int16_t *pass1_output, - int16_t skip_adding, - uint8_t *dest, int stride); -#else -#define vpx_idct16x16_256_add_neon_pass1_tran_low \ - vpx_idct16x16_256_add_neon_pass1 -#define vpx_idct16x16_256_add_neon_pass2_tran_low \ - vpx_idct16x16_256_add_neon_pass2 -#endif - -void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); -void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, - int16_t *pass1_output); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vpx_push_neon(int64_t *store); -extern void vpx_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output, - pass1_output, 0, dest, stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2_tran_low( - input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif -} - -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif -} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c index 28b9465584..057731ad92 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -87,614 +88,573 @@ static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, // 13 84 93 103 110 125 // 14 98 106 115 127 // 15 117 128 -static void idct32_12_neon(const tran_low_t *input, int16_t *output) { - int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; - int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int16x8_t in8, in9, in10, in11; - int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27, - s1_28, s1_29, s1_31; - int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21, - s2_26, s2_27, s2_28, s2_29; - int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22, - s3_25, s3_26, s3_29, s3_30; - int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18, - s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28, - s4_29, s4_30, s4_31; - int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, - s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, - s5_29; - int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, - s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, - s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; - int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, - s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, - s7_25, s7_26, s7_27; +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) { + int16x4_t tmp[8]; + int16x8_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32]; - load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); - transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + load_8x8_s16(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); - load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, - &tmp7); - transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9, - &in10, &in11); + load_4x8_s16(input + 8, &tmp[0], &tmp[1], &tmp[2], &tmp[3], &tmp[4], &tmp[5], + &tmp[6], &tmp[7]); + transpose_s16_4x8(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5], tmp[6], + tmp[7], &in[8], &in[9], &in[10], &in[11]); // stage 1 - s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); - s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); - s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); - s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); - s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); - s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); - s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); - s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); - s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); - s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); - s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); - s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 - s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); - s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); - s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); - s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); - s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); - s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); - s2_18 = vsubq_s16(s1_19, s1_18); - s2_19 = vaddq_s16(s1_18, s1_19); - s2_20 = vaddq_s16(s1_20, s1_21); - s2_21 = vsubq_s16(s1_20, s1_21); - s2_26 = vsubq_s16(s1_27, s1_26); - s2_27 = vaddq_s16(s1_26, s1_27); - s2_28 = vaddq_s16(s1_28, s1_29); - s2_29 = vsubq_s16(s1_28, s1_29); + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); // stage 3 - s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); - s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); - s3_10 = vsubq_s16(s2_11, s2_10); - s3_11 = vaddq_s16(s2_10, s2_11); - s3_12 = vaddq_s16(s2_12, s2_13); - s3_13 = vsubq_s16(s2_12, s2_13); + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); - s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, - cospi_28_64); - s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, - cospi_4_64); + s3[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); - s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, - -cospi_4_64); - s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, - cospi_28_64); + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); - s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, - cospi_12_64); - s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, - cospi_20_64); + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); - s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, - -cospi_20_64); - s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, - cospi_12_64); + s3[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); // stage 4 - s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); - s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); - s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); - s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, - cospi_24_64); - s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, - cospi_8_64); - - s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, - -cospi_8_64); - s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + s4[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); - s4_16 = vaddq_s16(s1_16, s2_19); - s4_17 = vaddq_s16(s3_17, s3_18); - s4_18 = vsubq_s16(s3_17, s3_18); - s4_19 = vsubq_s16(s1_16, s2_19); - s4_20 = vsubq_s16(s1_23, s2_20); - s4_21 = vsubq_s16(s3_22, s3_21); - s4_22 = vaddq_s16(s3_21, s3_22); - s4_23 = vaddq_s16(s2_20, s1_23); - s4_24 = vaddq_s16(s1_24, s2_27); - s4_25 = vaddq_s16(s3_25, s3_26); - s4_26 = vsubq_s16(s3_25, s3_26); - s4_27 = vsubq_s16(s1_24, s2_27); - s4_28 = vsubq_s16(s1_31, s2_28); - s4_29 = vsubq_s16(s3_30, s3_29); - s4_30 = vaddq_s16(s3_29, s3_30); - s4_31 = vaddq_s16(s2_28, s1_31); + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s1[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s1[16], s2[19]); + s4[20] = vsubq_s16(s1[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s1[23]); + s4[24] = vaddq_s16(s1[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s1[24], s2[27]); + s4[28] = vsubq_s16(s1[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s1[31]); // stage 5 - s5_0 = vaddq_s16(s4_0, s4_3); - s5_1 = vaddq_s16(s4_0, s4_2); - s5_2 = vsubq_s16(s4_0, s4_2); - s5_3 = vsubq_s16(s4_0, s4_3); + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); - s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64); - s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64); + s5[5] = sub_multiply_shift_and_narrow_s16(s3[7], s3[4], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s3[4], s3[7], cospi_16_64); - s5_8 = vaddq_s16(s2_8, s3_11); - s5_9 = vaddq_s16(s4_9, s4_10); - s5_10 = vsubq_s16(s4_9, s4_10); - s5_11 = vsubq_s16(s2_8, s3_11); - s5_12 = vsubq_s16(s2_15, s3_12); - s5_13 = vsubq_s16(s4_14, s4_13); - s5_14 = vaddq_s16(s4_13, s4_14); - s5_15 = vaddq_s16(s2_15, s3_12); + s5[8] = vaddq_s16(s2[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s2[8], s3[11]); + s5[12] = vsubq_s16(s2[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s2[15], s3[12]); - s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, - cospi_24_64); - s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, - cospi_8_64); + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); - s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, - cospi_24_64); - s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, - cospi_8_64); + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); - s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, - -cospi_8_64); - s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, - cospi_24_64); + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); - s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, - -cospi_8_64); - s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26, - cospi_24_64); + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); // stage 6 - s6_0 = vaddq_s16(s5_0, s3_7); - s6_1 = vaddq_s16(s5_1, s5_6); - s6_2 = vaddq_s16(s5_2, s5_5); - s6_3 = vaddq_s16(s5_3, s3_4); - s6_4 = vsubq_s16(s5_3, s3_4); - s6_5 = vsubq_s16(s5_2, s5_5); - s6_6 = vsubq_s16(s5_1, s5_6); - s6_7 = vsubq_s16(s5_0, s3_7); + s6[0] = vaddq_s16(s5[0], s3[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s3[4]); + s6[4] = vsubq_s16(s5[3], s3[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s3[7]); - s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); - s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); - s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); - s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); - s6_16 = vaddq_s16(s4_16, s4_23); - s6_17 = vaddq_s16(s4_17, s4_22); - s6_18 = vaddq_s16(s5_18, s5_21); - s6_19 = vaddq_s16(s5_19, s5_20); - s6_20 = vsubq_s16(s5_19, s5_20); - s6_21 = vsubq_s16(s5_18, s5_21); - s6_22 = vsubq_s16(s4_17, s4_22); - s6_23 = vsubq_s16(s4_16, s4_23); + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); - s6_24 = vsubq_s16(s4_31, s4_24); - s6_25 = vsubq_s16(s4_30, s4_25); - s6_26 = vsubq_s16(s5_29, s5_26); - s6_27 = vsubq_s16(s5_28, s5_27); - s6_28 = vaddq_s16(s5_27, s5_28); - s6_29 = vaddq_s16(s5_26, s5_29); - s6_30 = vaddq_s16(s4_25, s4_30); - s6_31 = vaddq_s16(s4_24, s4_31); + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); // stage 7 - s7_0 = vaddq_s16(s6_0, s5_15); - s7_1 = vaddq_s16(s6_1, s5_14); - s7_2 = vaddq_s16(s6_2, s6_13); - s7_3 = vaddq_s16(s6_3, s6_12); - s7_4 = vaddq_s16(s6_4, s6_11); - s7_5 = vaddq_s16(s6_5, s6_10); - s7_6 = vaddq_s16(s6_6, s5_9); - s7_7 = vaddq_s16(s6_7, s5_8); - s7_8 = vsubq_s16(s6_7, s5_8); - s7_9 = vsubq_s16(s6_6, s5_9); - s7_10 = vsubq_s16(s6_5, s6_10); - s7_11 = vsubq_s16(s6_4, s6_11); - s7_12 = vsubq_s16(s6_3, s6_12); - s7_13 = vsubq_s16(s6_2, s6_13); - s7_14 = vsubq_s16(s6_1, s5_14); - s7_15 = vsubq_s16(s6_0, s5_15); + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); - s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); - s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); - s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); - s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); - s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); - s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); - s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); - s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); // final stage - vst1q_s16(output, vaddq_s16(s7_0, s6_31)); + vst1q_s16(output, vaddq_s16(s7[0], s6[31])); output += 16; - vst1q_s16(output, vaddq_s16(s7_1, s6_30)); + vst1q_s16(output, vaddq_s16(s7[1], s6[30])); output += 16; - vst1q_s16(output, vaddq_s16(s7_2, s6_29)); + vst1q_s16(output, vaddq_s16(s7[2], s6[29])); output += 16; - vst1q_s16(output, vaddq_s16(s7_3, s6_28)); + vst1q_s16(output, vaddq_s16(s7[3], s6[28])); output += 16; - vst1q_s16(output, vaddq_s16(s7_4, s7_27)); + vst1q_s16(output, vaddq_s16(s7[4], s7[27])); output += 16; - vst1q_s16(output, vaddq_s16(s7_5, s7_26)); + vst1q_s16(output, vaddq_s16(s7[5], s7[26])); output += 16; - vst1q_s16(output, vaddq_s16(s7_6, s7_25)); + vst1q_s16(output, vaddq_s16(s7[6], s7[25])); output += 16; - vst1q_s16(output, vaddq_s16(s7_7, s7_24)); + vst1q_s16(output, vaddq_s16(s7[7], s7[24])); output += 16; - vst1q_s16(output, vaddq_s16(s7_8, s7_23)); + vst1q_s16(output, vaddq_s16(s7[8], s7[23])); output += 16; - vst1q_s16(output, vaddq_s16(s7_9, s7_22)); + vst1q_s16(output, vaddq_s16(s7[9], s7[22])); output += 16; - vst1q_s16(output, vaddq_s16(s7_10, s7_21)); + vst1q_s16(output, vaddq_s16(s7[10], s7[21])); output += 16; - vst1q_s16(output, vaddq_s16(s7_11, s7_20)); + vst1q_s16(output, vaddq_s16(s7[11], s7[20])); output += 16; - vst1q_s16(output, vaddq_s16(s7_12, s6_19)); + vst1q_s16(output, vaddq_s16(s7[12], s6[19])); output += 16; - vst1q_s16(output, vaddq_s16(s7_13, s6_18)); + vst1q_s16(output, vaddq_s16(s7[13], s6[18])); output += 16; - vst1q_s16(output, vaddq_s16(s7_14, s6_17)); + vst1q_s16(output, vaddq_s16(s7[14], s6[17])); output += 16; - vst1q_s16(output, vaddq_s16(s7_15, s6_16)); + vst1q_s16(output, vaddq_s16(s7[15], s6[16])); output += 16; - vst1q_s16(output, vsubq_s16(s7_15, s6_16)); + vst1q_s16(output, vsubq_s16(s7[15], s6[16])); output += 16; - vst1q_s16(output, vsubq_s16(s7_14, s6_17)); + vst1q_s16(output, vsubq_s16(s7[14], s6[17])); output += 16; - vst1q_s16(output, vsubq_s16(s7_13, s6_18)); + vst1q_s16(output, vsubq_s16(s7[13], s6[18])); output += 16; - vst1q_s16(output, vsubq_s16(s7_12, s6_19)); + vst1q_s16(output, vsubq_s16(s7[12], s6[19])); output += 16; - vst1q_s16(output, vsubq_s16(s7_11, s7_20)); + vst1q_s16(output, vsubq_s16(s7[11], s7[20])); output += 16; - vst1q_s16(output, vsubq_s16(s7_10, s7_21)); + vst1q_s16(output, vsubq_s16(s7[10], s7[21])); output += 16; - vst1q_s16(output, vsubq_s16(s7_9, s7_22)); + vst1q_s16(output, vsubq_s16(s7[9], s7[22])); output += 16; - vst1q_s16(output, vsubq_s16(s7_8, s7_23)); + vst1q_s16(output, vsubq_s16(s7[8], s7[23])); output += 16; - vst1q_s16(output, vsubq_s16(s7_7, s7_24)); + vst1q_s16(output, vsubq_s16(s7[7], s7[24])); output += 16; - vst1q_s16(output, vsubq_s16(s7_6, s7_25)); + vst1q_s16(output, vsubq_s16(s7[6], s7[25])); output += 16; - vst1q_s16(output, vsubq_s16(s7_5, s7_26)); + vst1q_s16(output, vsubq_s16(s7[5], s7[26])); output += 16; - vst1q_s16(output, vsubq_s16(s7_4, s7_27)); + vst1q_s16(output, vsubq_s16(s7[4], s7[27])); output += 16; - vst1q_s16(output, vsubq_s16(s7_3, s6_28)); + vst1q_s16(output, vsubq_s16(s7[3], s6[28])); output += 16; - vst1q_s16(output, vsubq_s16(s7_2, s6_29)); + vst1q_s16(output, vsubq_s16(s7[2], s6[29])); output += 16; - vst1q_s16(output, vsubq_s16(s7_1, s6_30)); + vst1q_s16(output, vsubq_s16(s7[1], s6[30])); output += 16; - vst1q_s16(output, vsubq_s16(s7_0, s6_31)); + vst1q_s16(output, vsubq_s16(s7[0], s6[31])); } -static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) { - int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15; - int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24, - s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31; - int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, - s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, - s2_28, s2_29, s2_30, s2_31; - int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13, - s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30; - int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14, - s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, - s4_26, s4_27, s4_28, s4_29, s4_30, s4_31; - int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, - s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, - s5_29; - int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, - s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, - s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; - int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, - s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, - s7_25, s7_26, s7_27; - int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag) { + int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], + out[32]; - load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5, - &in6, &in7); + load_and_transpose_s16_8x8(input, 16, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); - load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12, - &in13, &in14, &in15); + load_and_transpose_s16_8x8(input + 8, 16, &in[8], &in[9], &in[10], &in[11], + &in[12], &in[13], &in[14], &in[15]); // stage 1 - s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); - s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); - s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64); - s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64); + s1[17] = multiply_shift_and_narrow_s16(in[15], -cospi_17_64); + s1[30] = multiply_shift_and_narrow_s16(in[15], cospi_15_64); - s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); - s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + s1[18] = multiply_shift_and_narrow_s16(in[9], cospi_23_64); + s1[29] = multiply_shift_and_narrow_s16(in[9], cospi_9_64); - s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); - s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); - s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); - s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); - s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); - s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + s1[21] = multiply_shift_and_narrow_s16(in[11], -cospi_21_64); + s1[26] = multiply_shift_and_narrow_s16(in[11], cospi_11_64); - s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64); - s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64); + s1[22] = multiply_shift_and_narrow_s16(in[13], cospi_19_64); + s1[25] = multiply_shift_and_narrow_s16(in[13], cospi_13_64); - s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); - s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 - s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); - s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); - s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64); - s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64); + s2[9] = multiply_shift_and_narrow_s16(in[14], -cospi_18_64); + s2[14] = multiply_shift_and_narrow_s16(in[14], cospi_14_64); - s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); - s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + s2[10] = multiply_shift_and_narrow_s16(in[10], cospi_22_64); + s2[13] = multiply_shift_and_narrow_s16(in[10], cospi_10_64); - s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); - s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); - s2_16 = vaddq_s16(s1_16, s1_17); - s2_17 = vsubq_s16(s1_16, s1_17); - s2_18 = vsubq_s16(s1_19, s1_18); - s2_19 = vaddq_s16(s1_18, s1_19); - s2_20 = vaddq_s16(s1_20, s1_21); - s2_21 = vsubq_s16(s1_20, s1_21); - s2_22 = vsubq_s16(s1_23, s1_22); - s2_23 = vaddq_s16(s1_22, s1_23); - s2_24 = vaddq_s16(s1_24, s1_25); - s2_25 = vsubq_s16(s1_24, s1_25); - s2_26 = vsubq_s16(s1_27, s1_26); - s2_27 = vaddq_s16(s1_26, s1_27); - s2_28 = vaddq_s16(s1_28, s1_29); - s2_29 = vsubq_s16(s1_28, s1_29); - s2_30 = vsubq_s16(s1_31, s1_30); - s2_31 = vaddq_s16(s1_30, s1_31); + s2[16] = vaddq_s16(s1[16], s1[17]); + s2[17] = vsubq_s16(s1[16], s1[17]); + s2[18] = vsubq_s16(s1[19], s1[18]); + s2[19] = vaddq_s16(s1[18], s1[19]); + s2[20] = vaddq_s16(s1[20], s1[21]); + s2[21] = vsubq_s16(s1[20], s1[21]); + s2[22] = vsubq_s16(s1[23], s1[22]); + s2[23] = vaddq_s16(s1[22], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[25]); + s2[25] = vsubq_s16(s1[24], s1[25]); + s2[26] = vsubq_s16(s1[27], s1[26]); + s2[27] = vaddq_s16(s1[26], s1[27]); + s2[28] = vaddq_s16(s1[28], s1[29]); + s2[29] = vsubq_s16(s1[28], s1[29]); + s2[30] = vsubq_s16(s1[31], s1[30]); + s2[31] = vaddq_s16(s1[30], s1[31]); // stage 3 - s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); - s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + s3[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s3[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); - s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64); - s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64); + s3[5] = multiply_shift_and_narrow_s16(in[12], -cospi_20_64); + s3[6] = multiply_shift_and_narrow_s16(in[12], cospi_12_64); - s3_8 = vaddq_s16(s2_8, s2_9); - s3_9 = vsubq_s16(s2_8, s2_9); - s3_10 = vsubq_s16(s2_11, s2_10); - s3_11 = vaddq_s16(s2_10, s2_11); - s3_12 = vaddq_s16(s2_12, s2_13); - s3_13 = vsubq_s16(s2_12, s2_13); - s3_14 = vsubq_s16(s2_15, s2_14); - s3_15 = vaddq_s16(s2_14, s2_15); + s3[8] = vaddq_s16(s2[8], s2[9]); + s3[9] = vsubq_s16(s2[8], s2[9]); + s3[10] = vsubq_s16(s2[11], s2[10]); + s3[11] = vaddq_s16(s2[10], s2[11]); + s3[12] = vaddq_s16(s2[12], s2[13]); + s3[13] = vsubq_s16(s2[12], s2[13]); + s3[14] = vsubq_s16(s2[15], s2[14]); + s3[15] = vaddq_s16(s2[14], s2[15]); - s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30, - cospi_28_64); - s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30, - cospi_4_64); + s3[17] = multiply_accumulate_shift_and_narrow_s16(s2[17], -cospi_4_64, s2[30], + cospi_28_64); + s3[30] = multiply_accumulate_shift_and_narrow_s16(s2[17], cospi_28_64, s2[30], + cospi_4_64); - s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, - -cospi_4_64); - s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, - cospi_28_64); + s3[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_28_64, + s2[29], -cospi_4_64); + s3[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_4_64, s2[29], + cospi_28_64); - s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, - cospi_12_64); - s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, - cospi_20_64); + s3[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_20_64, + s2[26], cospi_12_64); + s3[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], cospi_12_64, s2[26], + cospi_20_64); - s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25, - -cospi_20_64); - s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25, - cospi_12_64); + s3[22] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_12_64, + s2[25], -cospi_20_64); + s3[25] = multiply_accumulate_shift_and_narrow_s16(s2[22], -cospi_20_64, + s2[25], cospi_12_64); // stage 4 - s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); - s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); - s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + s4[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); + s4[2] = multiply_shift_and_narrow_s16(in[8], cospi_24_64); + s4[3] = multiply_shift_and_narrow_s16(in[8], cospi_8_64); - s4_4 = vaddq_s16(s3_4, s3_5); - s4_5 = vsubq_s16(s3_4, s3_5); - s4_6 = vsubq_s16(s3_7, s3_6); - s4_7 = vaddq_s16(s3_6, s3_7); + s4[4] = vaddq_s16(s3[4], s3[5]); + s4[5] = vsubq_s16(s3[4], s3[5]); + s4[6] = vsubq_s16(s3[7], s3[6]); + s4[7] = vaddq_s16(s3[6], s3[7]); - s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14, - cospi_24_64); - s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14, - cospi_8_64); - - s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, - -cospi_8_64); - s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + s4[9] = multiply_accumulate_shift_and_narrow_s16(s3[9], -cospi_8_64, s3[14], cospi_24_64); + s4[14] = multiply_accumulate_shift_and_narrow_s16(s3[9], cospi_24_64, s3[14], + cospi_8_64); - s4_16 = vaddq_s16(s2_16, s2_19); - s4_17 = vaddq_s16(s3_17, s3_18); - s4_18 = vsubq_s16(s3_17, s3_18); - s4_19 = vsubq_s16(s2_16, s2_19); - s4_20 = vsubq_s16(s2_23, s2_20); - s4_21 = vsubq_s16(s3_22, s3_21); - s4_22 = vaddq_s16(s3_21, s3_22); - s4_23 = vaddq_s16(s2_20, s2_23); - s4_24 = vaddq_s16(s2_24, s2_27); - s4_25 = vaddq_s16(s3_25, s3_26); - s4_26 = vsubq_s16(s3_25, s3_26); - s4_27 = vsubq_s16(s2_24, s2_27); - s4_28 = vsubq_s16(s2_31, s2_28); - s4_29 = vsubq_s16(s3_30, s3_29); - s4_30 = vaddq_s16(s3_29, s3_30); - s4_31 = vaddq_s16(s2_28, s2_31); + s4[10] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_24_64, + s3[13], -cospi_8_64); + s4[13] = multiply_accumulate_shift_and_narrow_s16(s3[10], -cospi_8_64, s3[13], + cospi_24_64); + + s4[16] = vaddq_s16(s2[16], s2[19]); + s4[17] = vaddq_s16(s3[17], s3[18]); + s4[18] = vsubq_s16(s3[17], s3[18]); + s4[19] = vsubq_s16(s2[16], s2[19]); + s4[20] = vsubq_s16(s2[23], s2[20]); + s4[21] = vsubq_s16(s3[22], s3[21]); + s4[22] = vaddq_s16(s3[21], s3[22]); + s4[23] = vaddq_s16(s2[20], s2[23]); + s4[24] = vaddq_s16(s2[24], s2[27]); + s4[25] = vaddq_s16(s3[25], s3[26]); + s4[26] = vsubq_s16(s3[25], s3[26]); + s4[27] = vsubq_s16(s2[24], s2[27]); + s4[28] = vsubq_s16(s2[31], s2[28]); + s4[29] = vsubq_s16(s3[30], s3[29]); + s4[30] = vaddq_s16(s3[29], s3[30]); + s4[31] = vaddq_s16(s2[28], s2[31]); // stage 5 - s5_0 = vaddq_s16(s4_0, s4_3); - s5_1 = vaddq_s16(s4_0, s4_2); - s5_2 = vsubq_s16(s4_0, s4_2); - s5_3 = vsubq_s16(s4_0, s4_3); + s5[0] = vaddq_s16(s4[0], s4[3]); + s5[1] = vaddq_s16(s4[0], s4[2]); + s5[2] = vsubq_s16(s4[0], s4[2]); + s5[3] = vsubq_s16(s4[0], s4[3]); - s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64); - s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64); + s5[5] = sub_multiply_shift_and_narrow_s16(s4[6], s4[5], cospi_16_64); + s5[6] = add_multiply_shift_and_narrow_s16(s4[5], s4[6], cospi_16_64); - s5_8 = vaddq_s16(s3_8, s3_11); - s5_9 = vaddq_s16(s4_9, s4_10); - s5_10 = vsubq_s16(s4_9, s4_10); - s5_11 = vsubq_s16(s3_8, s3_11); - s5_12 = vsubq_s16(s3_15, s3_12); - s5_13 = vsubq_s16(s4_14, s4_13); - s5_14 = vaddq_s16(s4_13, s4_14); - s5_15 = vaddq_s16(s3_15, s3_12); + s5[8] = vaddq_s16(s3[8], s3[11]); + s5[9] = vaddq_s16(s4[9], s4[10]); + s5[10] = vsubq_s16(s4[9], s4[10]); + s5[11] = vsubq_s16(s3[8], s3[11]); + s5[12] = vsubq_s16(s3[15], s3[12]); + s5[13] = vsubq_s16(s4[14], s4[13]); + s5[14] = vaddq_s16(s4[13], s4[14]); + s5[15] = vaddq_s16(s3[15], s3[12]); - s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, - cospi_24_64); - s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, - cospi_8_64); + s5[18] = multiply_accumulate_shift_and_narrow_s16(s4[18], -cospi_8_64, s4[29], + cospi_24_64); + s5[29] = multiply_accumulate_shift_and_narrow_s16(s4[18], cospi_24_64, s4[29], + cospi_8_64); - s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, - cospi_24_64); - s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, - cospi_8_64); + s5[19] = multiply_accumulate_shift_and_narrow_s16(s4[19], -cospi_8_64, s4[28], + cospi_24_64); + s5[28] = multiply_accumulate_shift_and_narrow_s16(s4[19], cospi_24_64, s4[28], + cospi_8_64); - s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, - -cospi_8_64); - s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, - cospi_24_64); + s5[20] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_24_64, + s4[27], -cospi_8_64); + s5[27] = multiply_accumulate_shift_and_narrow_s16(s4[20], -cospi_8_64, s4[27], + cospi_24_64); - s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, - -cospi_8_64); - s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26, - cospi_24_64); + s5[21] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_24_64, + s4[26], -cospi_8_64); + s5[26] = multiply_accumulate_shift_and_narrow_s16(s4[21], -cospi_8_64, s4[26], + cospi_24_64); // stage 6 - s6_0 = vaddq_s16(s5_0, s4_7); - s6_1 = vaddq_s16(s5_1, s5_6); - s6_2 = vaddq_s16(s5_2, s5_5); - s6_3 = vaddq_s16(s5_3, s4_4); - s6_4 = vsubq_s16(s5_3, s4_4); - s6_5 = vsubq_s16(s5_2, s5_5); - s6_6 = vsubq_s16(s5_1, s5_6); - s6_7 = vsubq_s16(s5_0, s4_7); + s6[0] = vaddq_s16(s5[0], s4[7]); + s6[1] = vaddq_s16(s5[1], s5[6]); + s6[2] = vaddq_s16(s5[2], s5[5]); + s6[3] = vaddq_s16(s5[3], s4[4]); + s6[4] = vsubq_s16(s5[3], s4[4]); + s6[5] = vsubq_s16(s5[2], s5[5]); + s6[6] = vsubq_s16(s5[1], s5[6]); + s6[7] = vsubq_s16(s5[0], s4[7]); - s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); - s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + s6[10] = sub_multiply_shift_and_narrow_s16(s5[13], s5[10], cospi_16_64); + s6[13] = add_multiply_shift_and_narrow_s16(s5[10], s5[13], cospi_16_64); - s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); - s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + s6[11] = sub_multiply_shift_and_narrow_s16(s5[12], s5[11], cospi_16_64); + s6[12] = add_multiply_shift_and_narrow_s16(s5[11], s5[12], cospi_16_64); - s6_16 = vaddq_s16(s4_16, s4_23); - s6_17 = vaddq_s16(s4_17, s4_22); - s6_18 = vaddq_s16(s5_18, s5_21); - s6_19 = vaddq_s16(s5_19, s5_20); - s6_20 = vsubq_s16(s5_19, s5_20); - s6_21 = vsubq_s16(s5_18, s5_21); - s6_22 = vsubq_s16(s4_17, s4_22); - s6_23 = vsubq_s16(s4_16, s4_23); - s6_24 = vsubq_s16(s4_31, s4_24); - s6_25 = vsubq_s16(s4_30, s4_25); - s6_26 = vsubq_s16(s5_29, s5_26); - s6_27 = vsubq_s16(s5_28, s5_27); - s6_28 = vaddq_s16(s5_27, s5_28); - s6_29 = vaddq_s16(s5_26, s5_29); - s6_30 = vaddq_s16(s4_25, s4_30); - s6_31 = vaddq_s16(s4_24, s4_31); + s6[16] = vaddq_s16(s4[16], s4[23]); + s6[17] = vaddq_s16(s4[17], s4[22]); + s6[18] = vaddq_s16(s5[18], s5[21]); + s6[19] = vaddq_s16(s5[19], s5[20]); + s6[20] = vsubq_s16(s5[19], s5[20]); + s6[21] = vsubq_s16(s5[18], s5[21]); + s6[22] = vsubq_s16(s4[17], s4[22]); + s6[23] = vsubq_s16(s4[16], s4[23]); + s6[24] = vsubq_s16(s4[31], s4[24]); + s6[25] = vsubq_s16(s4[30], s4[25]); + s6[26] = vsubq_s16(s5[29], s5[26]); + s6[27] = vsubq_s16(s5[28], s5[27]); + s6[28] = vaddq_s16(s5[27], s5[28]); + s6[29] = vaddq_s16(s5[26], s5[29]); + s6[30] = vaddq_s16(s4[25], s4[30]); + s6[31] = vaddq_s16(s4[24], s4[31]); // stage 7 - s7_0 = vaddq_s16(s6_0, s5_15); - s7_1 = vaddq_s16(s6_1, s5_14); - s7_2 = vaddq_s16(s6_2, s6_13); - s7_3 = vaddq_s16(s6_3, s6_12); - s7_4 = vaddq_s16(s6_4, s6_11); - s7_5 = vaddq_s16(s6_5, s6_10); - s7_6 = vaddq_s16(s6_6, s5_9); - s7_7 = vaddq_s16(s6_7, s5_8); - s7_8 = vsubq_s16(s6_7, s5_8); - s7_9 = vsubq_s16(s6_6, s5_9); - s7_10 = vsubq_s16(s6_5, s6_10); - s7_11 = vsubq_s16(s6_4, s6_11); - s7_12 = vsubq_s16(s6_3, s6_12); - s7_13 = vsubq_s16(s6_2, s6_13); - s7_14 = vsubq_s16(s6_1, s5_14); - s7_15 = vsubq_s16(s6_0, s5_15); + s7[0] = vaddq_s16(s6[0], s5[15]); + s7[1] = vaddq_s16(s6[1], s5[14]); + s7[2] = vaddq_s16(s6[2], s6[13]); + s7[3] = vaddq_s16(s6[3], s6[12]); + s7[4] = vaddq_s16(s6[4], s6[11]); + s7[5] = vaddq_s16(s6[5], s6[10]); + s7[6] = vaddq_s16(s6[6], s5[9]); + s7[7] = vaddq_s16(s6[7], s5[8]); + s7[8] = vsubq_s16(s6[7], s5[8]); + s7[9] = vsubq_s16(s6[6], s5[9]); + s7[10] = vsubq_s16(s6[5], s6[10]); + s7[11] = vsubq_s16(s6[4], s6[11]); + s7[12] = vsubq_s16(s6[3], s6[12]); + s7[13] = vsubq_s16(s6[2], s6[13]); + s7[14] = vsubq_s16(s6[1], s5[14]); + s7[15] = vsubq_s16(s6[0], s5[15]); - s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); - s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + s7[20] = sub_multiply_shift_and_narrow_s16(s6[27], s6[20], cospi_16_64); + s7[27] = add_multiply_shift_and_narrow_s16(s6[20], s6[27], cospi_16_64); - s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); - s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + s7[21] = sub_multiply_shift_and_narrow_s16(s6[26], s6[21], cospi_16_64); + s7[26] = add_multiply_shift_and_narrow_s16(s6[21], s6[26], cospi_16_64); - s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); - s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + s7[22] = sub_multiply_shift_and_narrow_s16(s6[25], s6[22], cospi_16_64); + s7[25] = add_multiply_shift_and_narrow_s16(s6[22], s6[25], cospi_16_64); - s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); - s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + s7[23] = sub_multiply_shift_and_narrow_s16(s6[24], s6[23], cospi_16_64); + s7[24] = add_multiply_shift_and_narrow_s16(s6[23], s6[24], cospi_16_64); // final stage - out0 = vaddq_s16(s7_0, s6_31); - out1 = vaddq_s16(s7_1, s6_30); - out2 = vaddq_s16(s7_2, s6_29); - out3 = vaddq_s16(s7_3, s6_28); - out4 = vaddq_s16(s7_4, s7_27); - out5 = vaddq_s16(s7_5, s7_26); - out6 = vaddq_s16(s7_6, s7_25); - out7 = vaddq_s16(s7_7, s7_24); + out[0] = final_add(s7[0], s6[31]); + out[1] = final_add(s7[1], s6[30]); + out[2] = final_add(s7[2], s6[29]); + out[3] = final_add(s7[3], s6[28]); + out[4] = final_add(s7[4], s7[27]); + out[5] = final_add(s7[5], s7[26]); + out[6] = final_add(s7[6], s7[25]); + out[7] = final_add(s7[7], s7[24]); + out[8] = final_add(s7[8], s7[23]); + out[9] = final_add(s7[9], s7[22]); + out[10] = final_add(s7[10], s7[21]); + out[11] = final_add(s7[11], s7[20]); + out[12] = final_add(s7[12], s6[19]); + out[13] = final_add(s7[13], s6[18]); + out[14] = final_add(s7[14], s6[17]); + out[15] = final_add(s7[15], s6[16]); + out[16] = final_sub(s7[15], s6[16]); + out[17] = final_sub(s7[14], s6[17]); + out[18] = final_sub(s7[13], s6[18]); + out[19] = final_sub(s7[12], s6[19]); + out[20] = final_sub(s7[11], s7[20]); + out[21] = final_sub(s7[10], s7[21]); + out[22] = final_sub(s7[9], s7[22]); + out[23] = final_sub(s7[8], s7[23]); + out[24] = final_sub(s7[7], s7[24]); + out[25] = final_sub(s7[6], s7[25]); + out[26] = final_sub(s7[5], s7[26]); + out[27] = final_sub(s7[4], s7[27]); + out[28] = final_sub(s7[3], s6[28]); + out[29] = final_sub(s7[2], s6[29]); + out[30] = final_sub(s7[1], s6[30]); + out[31] = final_sub(s7[0], s6[31]); - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, - stride); - - out0 = vaddq_s16(s7_8, s7_23); - out1 = vaddq_s16(s7_9, s7_22); - out2 = vaddq_s16(s7_10, s7_21); - out3 = vaddq_s16(s7_11, s7_20); - out4 = vaddq_s16(s7_12, s6_19); - out5 = vaddq_s16(s7_13, s6_18); - out6 = vaddq_s16(s7_14, s6_17); - out7 = vaddq_s16(s7_15, s6_16); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (8 * stride), stride); - - out0 = vsubq_s16(s7_15, s6_16); - out1 = vsubq_s16(s7_14, s6_17); - out2 = vsubq_s16(s7_13, s6_18); - out3 = vsubq_s16(s7_12, s6_19); - out4 = vsubq_s16(s7_11, s7_20); - out5 = vsubq_s16(s7_10, s7_21); - out6 = vsubq_s16(s7_9, s7_22); - out7 = vsubq_s16(s7_8, s7_23); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (16 * stride), stride); - - out0 = vsubq_s16(s7_7, s7_24); - out1 = vsubq_s16(s7_6, s7_25); - out2 = vsubq_s16(s7_5, s7_26); - out3 = vsubq_s16(s7_4, s7_27); - out4 = vsubq_s16(s7_3, s6_28); - out5 = vsubq_s16(s7_2, s6_29); - out6 = vsubq_s16(s7_1, s6_30); - out7 = vsubq_s16(s7_0, s6_31); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (24 * stride), stride); + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); + } } void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, @@ -703,11 +663,11 @@ void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int16_t temp[32 * 16]; int16_t *t = temp; - idct32_12_neon(input, temp); - idct32_12_neon(input + 32 * 8, temp + 8); + vpx_idct32_12_neon(input, temp); + vpx_idct32_12_neon(input + 32 * 8, temp + 8); for (i = 0; i < 32; i += 8) { - idct32_16_neon(t, dest, stride); + vpx_idct32_16_neon(t, dest, stride, 0); t += (16 * 8); dest += 8; } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 604d82abd1..8920b93363 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -39,7 +39,8 @@ static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index b56deeea6d..f570547e44 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -35,487 +36,465 @@ // 5 13 20 26 // 6 21 27 33 // 7 24 32 -static void idct32_6_neon(const tran_low_t *input, int16_t *output) { - int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; - int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, - s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, - s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, - s1_31; - int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, - s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, - s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, - s2_31; - int16x8_t s3_24, s3_25, s3_26, s3_27; +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in[8], s1[32], s2[32], s3[32]; - in0 = load_tran_low_to_s16q(input); + in[0] = load_tran_low_to_s16q(input); input += 32; - in1 = load_tran_low_to_s16q(input); + in[1] = load_tran_low_to_s16q(input); input += 32; - in2 = load_tran_low_to_s16q(input); + in[2] = load_tran_low_to_s16q(input); input += 32; - in3 = load_tran_low_to_s16q(input); + in[3] = load_tran_low_to_s16q(input); input += 32; - in4 = load_tran_low_to_s16q(input); + in[4] = load_tran_low_to_s16q(input); input += 32; - in5 = load_tran_low_to_s16q(input); + in[5] = load_tran_low_to_s16q(input); input += 32; - in6 = load_tran_low_to_s16q(input); + in[6] = load_tran_low_to_s16q(input); input += 32; - in7 = load_tran_low_to_s16q(input); - transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + in[7] = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); // stage 1 // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) - s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) - s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); - s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); - s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); - s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); - s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 - s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); - s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); // stage 3 - s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); - s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); - s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, - cospi_28_64); - s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, - cospi_4_64); + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); - s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, - cospi_12_64); - s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, - cospi_20_64); + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); - s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, - -cospi_20_64); - s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, - cospi_12_64); + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); // stage 4 - s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); - s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, - cospi_24_64); - s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, - cospi_8_64); + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], + cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); - s2_20 = vsubq_s16(s1_23, s1_20); - s2_21 = vsubq_s16(s1_22, s1_21); - s2_22 = vaddq_s16(s1_21, s1_22); - s2_23 = vaddq_s16(s1_20, s1_23); - s2_24 = vaddq_s16(s1_24, s1_27); - s2_25 = vaddq_s16(s1_25, s1_26); - s2_26 = vsubq_s16(s1_25, s1_26); - s2_27 = vsubq_s16(s1_24, s1_27); + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); // stage 5 - s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); - s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); - s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30, - cospi_24_64); - s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30, - cospi_8_64); + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30], + cospi_8_64); - s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31, - cospi_24_64); - s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31, - cospi_8_64); + s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31], + cospi_8_64); - s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, - -cospi_8_64); - s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, - cospi_24_64); + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); - s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, - -cospi_8_64); - s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, - cospi_24_64); + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); // stage 6 - s2_0 = vaddq_s16(s1_0, s1_7); - s2_1 = vaddq_s16(s1_0, s1_6); - s2_2 = vaddq_s16(s1_0, s1_5); - s2_3 = vaddq_s16(s1_0, s1_4); - s2_4 = vsubq_s16(s1_0, s1_4); - s2_5 = vsubq_s16(s1_0, s1_5); - s2_6 = vsubq_s16(s1_0, s1_6); - s2_7 = vsubq_s16(s1_0, s1_7); + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); - s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64); - s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64); + s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64); - s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64); - s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64); + s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64); - s2_16 = vaddq_s16(s1_16, s2_23); - s2_17 = vaddq_s16(s1_17, s2_22); - s2_18 = vaddq_s16(s1_18, s1_21); - s2_19 = vaddq_s16(s1_19, s1_20); - s2_20 = vsubq_s16(s1_19, s1_20); - s2_21 = vsubq_s16(s1_18, s1_21); - s2_22 = vsubq_s16(s1_17, s2_22); - s2_23 = vsubq_s16(s1_16, s2_23); + s2[16] = vaddq_s16(s1[16], s2[23]); + s2[17] = vaddq_s16(s1[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s2[22] = vsubq_s16(s1[17], s2[22]); + s2[23] = vsubq_s16(s1[16], s2[23]); - s3_24 = vsubq_s16(s1_31, s2_24); - s3_25 = vsubq_s16(s1_30, s2_25); - s3_26 = vsubq_s16(s1_29, s1_26); - s3_27 = vsubq_s16(s1_28, s1_27); - s2_28 = vaddq_s16(s1_27, s1_28); - s2_29 = vaddq_s16(s1_26, s1_29); - s2_30 = vaddq_s16(s2_25, s1_30); - s2_31 = vaddq_s16(s2_24, s1_31); + s3[24] = vsubq_s16(s1[31], s2[24]); + s3[25] = vsubq_s16(s1[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s1[30]); + s2[31] = vaddq_s16(s2[24], s1[31]); // stage 7 - s1_0 = vaddq_s16(s2_0, s2_15); - s1_1 = vaddq_s16(s2_1, s2_14); - s1_2 = vaddq_s16(s2_2, s2_13); - s1_3 = vaddq_s16(s2_3, s2_12); - s1_4 = vaddq_s16(s2_4, s2_11); - s1_5 = vaddq_s16(s2_5, s2_10); - s1_6 = vaddq_s16(s2_6, s2_9); - s1_7 = vaddq_s16(s2_7, s2_8); - s1_8 = vsubq_s16(s2_7, s2_8); - s1_9 = vsubq_s16(s2_6, s2_9); - s1_10 = vsubq_s16(s2_5, s2_10); - s1_11 = vsubq_s16(s2_4, s2_11); - s1_12 = vsubq_s16(s2_3, s2_12); - s1_13 = vsubq_s16(s2_2, s2_13); - s1_14 = vsubq_s16(s2_1, s2_14); - s1_15 = vsubq_s16(s2_0, s2_15); + s1[0] = vaddq_s16(s2[0], s2[15]); + s1[1] = vaddq_s16(s2[1], s2[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s2[9]); + s1[7] = vaddq_s16(s2[7], s2[8]); + s1[8] = vsubq_s16(s2[7], s2[8]); + s1[9] = vsubq_s16(s2[6], s2[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s2[14]); + s1[15] = vsubq_s16(s2[0], s2[15]); - s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); - s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); - s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); - s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); - s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64); - s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64); + s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64); - s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64); - s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64); + s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64); // final stage - vst1q_s16(output, vaddq_s16(s1_0, s2_31)); + vst1q_s16(output, vaddq_s16(s1[0], s2[31])); output += 8; - vst1q_s16(output, vaddq_s16(s1_1, s2_30)); + vst1q_s16(output, vaddq_s16(s1[1], s2[30])); output += 8; - vst1q_s16(output, vaddq_s16(s1_2, s2_29)); + vst1q_s16(output, vaddq_s16(s1[2], s2[29])); output += 8; - vst1q_s16(output, vaddq_s16(s1_3, s2_28)); + vst1q_s16(output, vaddq_s16(s1[3], s2[28])); output += 8; - vst1q_s16(output, vaddq_s16(s1_4, s1_27)); + vst1q_s16(output, vaddq_s16(s1[4], s1[27])); output += 8; - vst1q_s16(output, vaddq_s16(s1_5, s1_26)); + vst1q_s16(output, vaddq_s16(s1[5], s1[26])); output += 8; - vst1q_s16(output, vaddq_s16(s1_6, s1_25)); + vst1q_s16(output, vaddq_s16(s1[6], s1[25])); output += 8; - vst1q_s16(output, vaddq_s16(s1_7, s1_24)); + vst1q_s16(output, vaddq_s16(s1[7], s1[24])); output += 8; - vst1q_s16(output, vaddq_s16(s1_8, s1_23)); + vst1q_s16(output, vaddq_s16(s1[8], s1[23])); output += 8; - vst1q_s16(output, vaddq_s16(s1_9, s1_22)); + vst1q_s16(output, vaddq_s16(s1[9], s1[22])); output += 8; - vst1q_s16(output, vaddq_s16(s1_10, s1_21)); + vst1q_s16(output, vaddq_s16(s1[10], s1[21])); output += 8; - vst1q_s16(output, vaddq_s16(s1_11, s1_20)); + vst1q_s16(output, vaddq_s16(s1[11], s1[20])); output += 8; - vst1q_s16(output, vaddq_s16(s1_12, s2_19)); + vst1q_s16(output, vaddq_s16(s1[12], s2[19])); output += 8; - vst1q_s16(output, vaddq_s16(s1_13, s2_18)); + vst1q_s16(output, vaddq_s16(s1[13], s2[18])); output += 8; - vst1q_s16(output, vaddq_s16(s1_14, s2_17)); + vst1q_s16(output, vaddq_s16(s1[14], s2[17])); output += 8; - vst1q_s16(output, vaddq_s16(s1_15, s2_16)); + vst1q_s16(output, vaddq_s16(s1[15], s2[16])); output += 8; - vst1q_s16(output, vsubq_s16(s1_15, s2_16)); + vst1q_s16(output, vsubq_s16(s1[15], s2[16])); output += 8; - vst1q_s16(output, vsubq_s16(s1_14, s2_17)); + vst1q_s16(output, vsubq_s16(s1[14], s2[17])); output += 8; - vst1q_s16(output, vsubq_s16(s1_13, s2_18)); + vst1q_s16(output, vsubq_s16(s1[13], s2[18])); output += 8; - vst1q_s16(output, vsubq_s16(s1_12, s2_19)); + vst1q_s16(output, vsubq_s16(s1[12], s2[19])); output += 8; - vst1q_s16(output, vsubq_s16(s1_11, s1_20)); + vst1q_s16(output, vsubq_s16(s1[11], s1[20])); output += 8; - vst1q_s16(output, vsubq_s16(s1_10, s1_21)); + vst1q_s16(output, vsubq_s16(s1[10], s1[21])); output += 8; - vst1q_s16(output, vsubq_s16(s1_9, s1_22)); + vst1q_s16(output, vsubq_s16(s1[9], s1[22])); output += 8; - vst1q_s16(output, vsubq_s16(s1_8, s1_23)); + vst1q_s16(output, vsubq_s16(s1[8], s1[23])); output += 8; - vst1q_s16(output, vsubq_s16(s1_7, s1_24)); + vst1q_s16(output, vsubq_s16(s1[7], s1[24])); output += 8; - vst1q_s16(output, vsubq_s16(s1_6, s1_25)); + vst1q_s16(output, vsubq_s16(s1[6], s1[25])); output += 8; - vst1q_s16(output, vsubq_s16(s1_5, s1_26)); + vst1q_s16(output, vsubq_s16(s1[5], s1[26])); output += 8; - vst1q_s16(output, vsubq_s16(s1_4, s1_27)); + vst1q_s16(output, vsubq_s16(s1[4], s1[27])); output += 8; - vst1q_s16(output, vsubq_s16(s1_3, s2_28)); + vst1q_s16(output, vsubq_s16(s1[3], s2[28])); output += 8; - vst1q_s16(output, vsubq_s16(s1_2, s2_29)); + vst1q_s16(output, vsubq_s16(s1[2], s2[29])); output += 8; - vst1q_s16(output, vsubq_s16(s1_1, s2_30)); + vst1q_s16(output, vsubq_s16(s1[1], s2[30])); output += 8; - vst1q_s16(output, vsubq_s16(s1_0, s2_31)); + vst1q_s16(output, vsubq_s16(s1[0], s2[31])); } -static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { - int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; - int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; - int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, - s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, - s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, - s1_31; - int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, - s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, - s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, - s2_31; - int16x8_t s3_24, s3_25, s3_26, s3_27; +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag) { + int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; - load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6, - &in7); + load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], + &in[5], &in[6], &in[7]); // stage 1 - s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); - s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); + s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); // Different for _8_ - s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); - s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); + s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); - s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); - s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); + s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); - s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); - s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); + s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 - s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); - s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); + s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); - s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); - s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); + s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); // stage 3 - s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); - s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); + s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); - s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, - cospi_28_64); - s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, - cospi_4_64); + s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], + cospi_28_64); + s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], + cospi_4_64); // Different for _8_ - s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28, - -cospi_4_64); - s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28, - cospi_28_64); + s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64, + s1[28], -cospi_4_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28], + cospi_28_64); - s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, - cospi_12_64); - s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, - cospi_20_64); + s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, + s1[27], cospi_12_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], + cospi_20_64); - s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, - -cospi_20_64); - s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, - cospi_12_64); + s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, + s1[24], -cospi_20_64); + s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, + s1[24], cospi_12_64); // stage 4 - s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); - s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, - cospi_24_64); - s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, - cospi_8_64); - - s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12, - -cospi_8_64); - s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12, + s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], cospi_24_64); + s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], + cospi_8_64); - s2_16 = vaddq_s16(s1_16, s1_19); + s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64, + s2[12], -cospi_8_64); + s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12], + cospi_24_64); - s2_17 = vaddq_s16(s1_17, s1_18); - s2_18 = vsubq_s16(s1_17, s1_18); + s2[16] = vaddq_s16(s1[16], s1[19]); - s2_19 = vsubq_s16(s1_16, s1_19); + s2[17] = vaddq_s16(s1[17], s1[18]); + s2[18] = vsubq_s16(s1[17], s1[18]); - s2_20 = vsubq_s16(s1_23, s1_20); - s2_21 = vsubq_s16(s1_22, s1_21); + s2[19] = vsubq_s16(s1[16], s1[19]); - s2_22 = vaddq_s16(s1_21, s1_22); - s2_23 = vaddq_s16(s1_20, s1_23); + s2[20] = vsubq_s16(s1[23], s1[20]); + s2[21] = vsubq_s16(s1[22], s1[21]); - s2_24 = vaddq_s16(s1_24, s1_27); - s2_25 = vaddq_s16(s1_25, s1_26); - s2_26 = vsubq_s16(s1_25, s1_26); - s2_27 = vsubq_s16(s1_24, s1_27); + s2[22] = vaddq_s16(s1[21], s1[22]); + s2[23] = vaddq_s16(s1[20], s1[23]); - s2_28 = vsubq_s16(s1_31, s1_28); - s2_29 = vsubq_s16(s1_30, s1_29); - s2_30 = vaddq_s16(s1_29, s1_30); - s2_31 = vaddq_s16(s1_28, s1_31); + s2[24] = vaddq_s16(s1[24], s1[27]); + s2[25] = vaddq_s16(s1[25], s1[26]); + s2[26] = vsubq_s16(s1[25], s1[26]); + s2[27] = vsubq_s16(s1[24], s1[27]); + + s2[28] = vsubq_s16(s1[31], s1[28]); + s2[29] = vsubq_s16(s1[30], s1[29]); + s2[30] = vaddq_s16(s1[29], s1[30]); + s2[31] = vaddq_s16(s1[28], s1[31]); // stage 5 - s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); - s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); + s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); - s1_8 = vaddq_s16(s2_8, s2_11); - s1_9 = vaddq_s16(s2_9, s2_10); - s1_10 = vsubq_s16(s2_9, s2_10); - s1_11 = vsubq_s16(s2_8, s2_11); - s1_12 = vsubq_s16(s2_15, s2_12); - s1_13 = vsubq_s16(s2_14, s2_13); - s1_14 = vaddq_s16(s2_13, s2_14); - s1_15 = vaddq_s16(s2_12, s2_15); + s1[8] = vaddq_s16(s2[8], s2[11]); + s1[9] = vaddq_s16(s2[9], s2[10]); + s1[10] = vsubq_s16(s2[9], s2[10]); + s1[11] = vsubq_s16(s2[8], s2[11]); + s1[12] = vsubq_s16(s2[15], s2[12]); + s1[13] = vsubq_s16(s2[14], s2[13]); + s1[14] = vaddq_s16(s2[13], s2[14]); + s1[15] = vaddq_s16(s2[12], s2[15]); - s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29, - cospi_24_64); - s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29, - cospi_8_64); + s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29], + cospi_24_64); + s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29], + cospi_8_64); - s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28, - cospi_24_64); - s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28, - cospi_8_64); + s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28], + cospi_24_64); + s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28], + cospi_8_64); - s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, - -cospi_8_64); - s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, - cospi_24_64); + s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, + s2[27], -cospi_8_64); + s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], + cospi_24_64); - s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, - -cospi_8_64); - s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, - cospi_24_64); + s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, + s2[26], -cospi_8_64); + s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], + cospi_24_64); // stage 6 - s2_0 = vaddq_s16(s1_0, s1_7); - s2_1 = vaddq_s16(s1_0, s1_6); - s2_2 = vaddq_s16(s1_0, s1_5); - s2_3 = vaddq_s16(s1_0, s1_4); - s2_4 = vsubq_s16(s1_0, s1_4); - s2_5 = vsubq_s16(s1_0, s1_5); - s2_6 = vsubq_s16(s1_0, s1_6); - s2_7 = vsubq_s16(s1_0, s1_7); + s2[0] = vaddq_s16(s1[0], s1[7]); + s2[1] = vaddq_s16(s1[0], s1[6]); + s2[2] = vaddq_s16(s1[0], s1[5]); + s2[3] = vaddq_s16(s1[0], s1[4]); + s2[4] = vsubq_s16(s1[0], s1[4]); + s2[5] = vsubq_s16(s1[0], s1[5]); + s2[6] = vsubq_s16(s1[0], s1[6]); + s2[7] = vsubq_s16(s1[0], s1[7]); - s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64); - s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64); + s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64); + s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64); - s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64); - s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64); + s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64); + s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64); - s1_16 = vaddq_s16(s2_16, s2_23); - s1_17 = vaddq_s16(s2_17, s2_22); - s2_18 = vaddq_s16(s1_18, s1_21); - s2_19 = vaddq_s16(s1_19, s1_20); - s2_20 = vsubq_s16(s1_19, s1_20); - s2_21 = vsubq_s16(s1_18, s1_21); - s1_22 = vsubq_s16(s2_17, s2_22); - s1_23 = vsubq_s16(s2_16, s2_23); + s1[16] = vaddq_s16(s2[16], s2[23]); + s1[17] = vaddq_s16(s2[17], s2[22]); + s2[18] = vaddq_s16(s1[18], s1[21]); + s2[19] = vaddq_s16(s1[19], s1[20]); + s2[20] = vsubq_s16(s1[19], s1[20]); + s2[21] = vsubq_s16(s1[18], s1[21]); + s1[22] = vsubq_s16(s2[17], s2[22]); + s1[23] = vsubq_s16(s2[16], s2[23]); - s3_24 = vsubq_s16(s2_31, s2_24); - s3_25 = vsubq_s16(s2_30, s2_25); - s3_26 = vsubq_s16(s1_29, s1_26); - s3_27 = vsubq_s16(s1_28, s1_27); - s2_28 = vaddq_s16(s1_27, s1_28); - s2_29 = vaddq_s16(s1_26, s1_29); - s2_30 = vaddq_s16(s2_25, s2_30); - s2_31 = vaddq_s16(s2_24, s2_31); + s3[24] = vsubq_s16(s2[31], s2[24]); + s3[25] = vsubq_s16(s2[30], s2[25]); + s3[26] = vsubq_s16(s1[29], s1[26]); + s3[27] = vsubq_s16(s1[28], s1[27]); + s2[28] = vaddq_s16(s1[27], s1[28]); + s2[29] = vaddq_s16(s1[26], s1[29]); + s2[30] = vaddq_s16(s2[25], s2[30]); + s2[31] = vaddq_s16(s2[24], s2[31]); // stage 7 - s1_0 = vaddq_s16(s2_0, s1_15); - s1_1 = vaddq_s16(s2_1, s1_14); - s1_2 = vaddq_s16(s2_2, s2_13); - s1_3 = vaddq_s16(s2_3, s2_12); - s1_4 = vaddq_s16(s2_4, s2_11); - s1_5 = vaddq_s16(s2_5, s2_10); - s1_6 = vaddq_s16(s2_6, s1_9); - s1_7 = vaddq_s16(s2_7, s1_8); - s1_8 = vsubq_s16(s2_7, s1_8); - s1_9 = vsubq_s16(s2_6, s1_9); - s1_10 = vsubq_s16(s2_5, s2_10); - s1_11 = vsubq_s16(s2_4, s2_11); - s1_12 = vsubq_s16(s2_3, s2_12); - s1_13 = vsubq_s16(s2_2, s2_13); - s1_14 = vsubq_s16(s2_1, s1_14); - s1_15 = vsubq_s16(s2_0, s1_15); + s1[0] = vaddq_s16(s2[0], s1[15]); + s1[1] = vaddq_s16(s2[1], s1[14]); + s1[2] = vaddq_s16(s2[2], s2[13]); + s1[3] = vaddq_s16(s2[3], s2[12]); + s1[4] = vaddq_s16(s2[4], s2[11]); + s1[5] = vaddq_s16(s2[5], s2[10]); + s1[6] = vaddq_s16(s2[6], s1[9]); + s1[7] = vaddq_s16(s2[7], s1[8]); + s1[8] = vsubq_s16(s2[7], s1[8]); + s1[9] = vsubq_s16(s2[6], s1[9]); + s1[10] = vsubq_s16(s2[5], s2[10]); + s1[11] = vsubq_s16(s2[4], s2[11]); + s1[12] = vsubq_s16(s2[3], s2[12]); + s1[13] = vsubq_s16(s2[2], s2[13]); + s1[14] = vsubq_s16(s2[1], s1[14]); + s1[15] = vsubq_s16(s2[0], s1[15]); - s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); - s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); + s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); - s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); - s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); + s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); - s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64); - s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64); + s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64); + s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64); - s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64); - s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64); + s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64); + s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64); // final stage - out0 = vaddq_s16(s1_0, s2_31); - out1 = vaddq_s16(s1_1, s2_30); - out2 = vaddq_s16(s1_2, s2_29); - out3 = vaddq_s16(s1_3, s2_28); - out4 = vaddq_s16(s1_4, s1_27); - out5 = vaddq_s16(s1_5, s1_26); - out6 = vaddq_s16(s1_6, s1_25); - out7 = vaddq_s16(s1_7, s1_24); + out[0] = final_add(s1[0], s2[31]); + out[1] = final_add(s1[1], s2[30]); + out[2] = final_add(s1[2], s2[29]); + out[3] = final_add(s1[3], s2[28]); + out[4] = final_add(s1[4], s1[27]); + out[5] = final_add(s1[5], s1[26]); + out[6] = final_add(s1[6], s1[25]); + out[7] = final_add(s1[7], s1[24]); + out[8] = final_add(s1[8], s2[23]); + out[9] = final_add(s1[9], s2[22]); + out[10] = final_add(s1[10], s1[21]); + out[11] = final_add(s1[11], s1[20]); + out[12] = final_add(s1[12], s2[19]); + out[13] = final_add(s1[13], s2[18]); + out[14] = final_add(s1[14], s1[17]); + out[15] = final_add(s1[15], s1[16]); + out[16] = final_sub(s1[15], s1[16]); + out[17] = final_sub(s1[14], s1[17]); + out[18] = final_sub(s1[13], s2[18]); + out[19] = final_sub(s1[12], s2[19]); + out[20] = final_sub(s1[11], s1[20]); + out[21] = final_sub(s1[10], s1[21]); + out[22] = final_sub(s1[9], s2[22]); + out[23] = final_sub(s1[8], s2[23]); + out[24] = final_sub(s1[7], s1[24]); + out[25] = final_sub(s1[6], s1[25]); + out[26] = final_sub(s1[5], s1[26]); + out[27] = final_sub(s1[4], s1[27]); + out[28] = final_sub(s1[3], s2[28]); + out[29] = final_sub(s1[2], s2[29]); + out[30] = final_sub(s1[1], s2[30]); + out[31] = final_sub(s1[0], s2[31]); - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, - stride); - - out0 = vaddq_s16(s1_8, s2_23); - out1 = vaddq_s16(s1_9, s2_22); - out2 = vaddq_s16(s1_10, s1_21); - out3 = vaddq_s16(s1_11, s1_20); - out4 = vaddq_s16(s1_12, s2_19); - out5 = vaddq_s16(s1_13, s2_18); - out6 = vaddq_s16(s1_14, s1_17); - out7 = vaddq_s16(s1_15, s1_16); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (8 * stride), stride); - - out0 = vsubq_s16(s1_15, s1_16); - out1 = vsubq_s16(s1_14, s1_17); - out2 = vsubq_s16(s1_13, s2_18); - out3 = vsubq_s16(s1_12, s2_19); - out4 = vsubq_s16(s1_11, s1_20); - out5 = vsubq_s16(s1_10, s1_21); - out6 = vsubq_s16(s1_9, s2_22); - out7 = vsubq_s16(s1_8, s2_23); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (16 * stride), stride); - - out0 = vsubq_s16(s1_7, s1_24); - out1 = vsubq_s16(s1_6, s1_25); - out2 = vsubq_s16(s1_5, s1_26); - out3 = vsubq_s16(s1_4, s1_27); - out4 = vsubq_s16(s1_3, s2_28); - out5 = vsubq_s16(s1_2, s2_29); - out6 = vsubq_s16(s1_1, s2_30); - out7 = vsubq_s16(s1_0, s2_31); - - add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, - output + (24 * stride), stride); + if (highbd_flag) { + highbd_add_and_store_bd8(out, output, stride); + } else { + uint8_t *const outputT = (uint8_t *)output; + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); + } } void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, @@ -524,10 +503,10 @@ void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int16_t temp[32 * 8]; int16_t *t = temp; - idct32_6_neon(input, t); + vpx_idct32_6_neon(input, t); for (i = 0; i < 32; i += 8) { - idct32_8_neon(t, dest, stride); + vpx_idct32_8_neon(t, dest, stride, 0); t += (8 * 8); dest += 8; } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index de1bf97875..9f4589ea96 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -13,147 +13,143 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); +static INLINE void load_from_transformed(const int16_t *const trans_buf, + const int first, const int second, + int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(trans_buf + first * 8); + *q1 = vld1q_s16(trans_buf + second * 8); +} -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); +static INLINE void load_from_output(const int16_t *const out, const int first, + const int second, int16x8_t *const q0, + int16x8_t *const q1) { + *q0 = vld1q_s16(out + first * 32); + *q1 = vld1q_s16(out + second * 32); +} -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); +static INLINE void store_in_output(int16_t *const out, const int first, + const int second, const int16x8_t q0, + const int16x8_t q1) { + vst1q_s16(out + first * 32, q0); + vst1q_s16(out + second * 32, q1); +} -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; +static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2, + const int stride, int16x8_t q0, + int16x8_t q1, int16x8_t q2, + int16x8_t q3) { + uint8x8_t d[4]; - d8s16 = vld1_s16((int16_t *)p1); + d[0] = vld1_u8(p1); p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); + d[1] = vld1_u8(p1); + d[3] = vld1_u8(p2); p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); + d[2] = vld1_u8(p2); - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0])); + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1])); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2])); + q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3])); - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d[0] = vqmovun_s16(q0); + d[1] = vqmovun_s16(q1); + d[2] = vqmovun_s16(q2); + d[3] = vqmovun_s16(q3); - vst1_s16((int16_t *)p1, d9s16); + vst1_u8(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); + vst1_u8(p1, d[0]); + vst1_u8(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); + vst1_u8(p2, d[3]); } -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; +static INLINE void highbd_store_combine_results_bd8(uint16_t *p1, uint16_t *p2, + const int stride, + int16x8_t q0, int16x8_t q1, + int16x8_t q2, + int16x8_t q3) { + uint16x8_t d[4]; - d4s16 = vld1_s16((int16_t *)p1); + d[0] = vld1q_u16(p1); p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); + d[1] = vld1q_u16(p1); + d[3] = vld1q_u16(p2); p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); + d[2] = vld1q_u16(p2); - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); + q0 = vrshrq_n_s16(q0, 6); + q1 = vrshrq_n_s16(q1, 6); + q2 = vrshrq_n_s16(q2, 6); + q3 = vrshrq_n_s16(q3, 6); - q5s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + q0 = vaddq_s16(q0, vreinterpretq_s16_u16(d[0])); + q1 = vaddq_s16(q1, vreinterpretq_s16_u16(d[1])); + q2 = vaddq_s16(q2, vreinterpretq_s16_u16(d[2])); + q3 = vaddq_s16(q3, vreinterpretq_s16_u16(d[3])); - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + d[0] = vmovl_u8(vqmovun_s16(q0)); + d[1] = vmovl_u8(vqmovun_s16(q1)); + d[2] = vmovl_u8(vqmovun_s16(q2)); + d[3] = vmovl_u8(vqmovun_s16(q3)); - vst1_s16((int16_t *)p1, d5s16); + vst1q_u16(p1, d[1]); p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); + vst1q_u16(p1, d[0]); + vst1q_u16(p2, d[2]); p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); + vst1q_u16(p2, d[3]); } -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, - int16_t first_const, int16_t second_const, - int16x8_t *qAs16, int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; +static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1, + const int16_t first_const, + const int16_t second_const, + int16x8_t *const qOut0, + int16x8_t *const qOut1) { + int32x4_t q[4]; + int16x4_t d[6]; - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); + d[0] = vget_low_s16(qIn0); + d[1] = vget_high_s16(qIn0); + d[2] = vget_low_s16(qIn1); + d[3] = vget_high_s16(qIn1); - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); + // Note: using v{mul, mla, mls}l_n_s16 here slows down 35% with gcc 4.9. + d[4] = vdup_n_s16(first_const); + d[5] = vdup_n_s16(second_const); - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); + q[0] = vmull_s16(d[0], d[4]); + q[1] = vmull_s16(d[1], d[4]); + q[0] = vmlsl_s16(q[0], d[2], d[5]); + q[1] = vmlsl_s16(q[1], d[3], d[5]); - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); + q[2] = vmull_s16(d[0], d[5]); + q[3] = vmull_s16(d[1], d[5]); + q[2] = vmlal_s16(q[2], d[2], d[4]); + q[3] = vmlal_s16(q[3], d[3], d[4]); - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14)); + *qOut0 = vcombine_s16(vrshrn_n_s32(q[0], DCT_CONST_BITS), + vrshrn_n_s32(q[1], DCT_CONST_BITS)); + *qOut1 = vcombine_s16(vrshrn_n_s32(q[2], DCT_CONST_BITS), + vrshrn_n_s32(q[3], DCT_CONST_BITS)); } -static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1, - int16x8_t *s2, int16x8_t *s3, int16x8_t *s4, - int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) { +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *const s0, + int16x8_t *const s1, int16x8_t *const s2, + int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, + int16x8_t *const s7) { *s0 = vld1q_s16(in); in += 32; *s1 = vld1q_s16(in); @@ -207,11 +203,10 @@ static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0, - int16x8_t *s1, int16x8_t *s2, - int16x8_t *s3, int16x8_t *s4, - int16x8_t *s5, int16x8_t *s6, - int16x8_t *s7) { +static INLINE void load_s16x8q_tran_low( + const tran_low_t *in, int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, int16x8_t *const s4, + int16x8_t *const s5, int16x8_t *const s6, int16x8_t *const s7) { *s0 = load_tran_low_to_s16q(in); in += 32; *s1 = load_tran_low_to_s16q(in); @@ -243,197 +238,287 @@ static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, #define idct32_transpose_pair_tran_low idct32_transpose_pair #endif // CONFIG_VP9_HIGHBITDEPTH -static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, - int16x8_t q3s16, int16x8_t q6s16, - int16x8_t q7s16, int16x8_t q8s16, - int16x8_t q9s16, int16x8_t q10s16, - int16x8_t q11s16, int16x8_t q12s16, - int16x8_t q13s16, int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; +static INLINE void idct32_bands_end_1st_pass(int16_t *const out, + int16x8_t *const q) { + store_in_output(out, 16, 17, q[6], q[7]); + store_in_output(out, 14, 15, q[8], q[9]); - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 30, 31, q[6], q[7]); + store_in_output(out, 0, 1, q[4], q[5]); - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 18, 19, q[6], q[7]); + store_in_output(out, 12, 13, q[8], q[9]); - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 28, 29, q[6], q[7]); + store_in_output(out, 2, 3, q[4], q[5]); - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 20, 21, q[6], q[7]); + store_in_output(out, 10, 11, q[8], q[9]); - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 26, 27, q[6], q[7]); + store_in_output(out, 4, 5, q[4], q[5]); - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = vaddq_s16(q[4], q[1]); + q[9] = vaddq_s16(q[5], q[0]); + q[6] = vsubq_s16(q[5], q[0]); + q[7] = vsubq_s16(q[4], q[1]); + store_in_output(out, 22, 23, q[6], q[7]); + store_in_output(out, 8, 9, q[8], q[9]); - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = vaddq_s16(q[2], q[1]); + q[5] = vaddq_s16(q[3], q[0]); + q[6] = vsubq_s16(q[3], q[0]); + q[7] = vsubq_s16(q[2], q[1]); + store_in_output(out, 24, 25, q[6], q[7]); + store_in_output(out, 6, 7, q[4], q[5]); } -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, - int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, - int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, - int16x8_t q14s16, int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest /* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; +static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out, + uint8_t *const dest, + const int stride, + int16x8_t *const q) { + uint8_t *dest0 = dest + 0 * stride; + uint8_t *dest1 = dest + 31 * stride; + uint8_t *dest2 = dest + 16 * stride; + uint8_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); + dest2 += str2; + dest3 -= str2; - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); + dest0 += str2; + dest1 -= str2; - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]); - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]); } -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { +static INLINE void highbd_idct32_bands_end_2nd_pass_bd8( + const int16_t *const out, uint16_t *const dest, const int stride, + int16x8_t *const q) { + uint16_t *dest0 = dest + 0 * stride; + uint16_t *dest1 = dest + 31 * stride; + uint16_t *dest2 = dest + 16 * stride; + uint16_t *dest3 = dest + 15 * stride; + const int str2 = stride << 1; + + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 30, 31, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 12, 13, &q[0], &q[1]); + q[2] = vaddq_s16(q[10], q[1]); + q[3] = vaddq_s16(q[11], q[0]); + q[4] = vsubq_s16(q[11], q[0]); + q[5] = vsubq_s16(q[10], q[1]); + + load_from_output(out, 18, 19, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 28, 29, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 10, 11, &q[0], &q[1]); + q[2] = vaddq_s16(q[12], q[1]); + q[3] = vaddq_s16(q[13], q[0]); + q[4] = vsubq_s16(q[13], q[0]); + q[5] = vsubq_s16(q[12], q[1]); + + load_from_output(out, 20, 21, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + dest2 += str2; + dest3 -= str2; + + load_from_output(out, 26, 27, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); + dest0 += str2; + dest1 -= str2; + + load_from_output(out, 8, 9, &q[0], &q[1]); + q[2] = vaddq_s16(q[14], q[1]); + q[3] = vaddq_s16(q[15], q[0]); + q[4] = vsubq_s16(q[15], q[0]); + q[5] = vsubq_s16(q[14], q[1]); + + load_from_output(out, 22, 23, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); + highbd_store_combine_results_bd8(dest2, dest3, stride, q[6], q[7], q[8], + q[9]); + + load_from_output(out, 24, 25, &q[0], &q[1]); + q[4] = final_add(q[2], q[1]); + q[5] = final_add(q[3], q[0]); + q[6] = final_sub(q[3], q[0]); + q[7] = final_sub(q[2], q[1]); + highbd_store_combine_results_bd8(dest0, dest1, stride, q[4], q[5], q[6], + q[7]); +} + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag) { int i, idct32_pass_loop; int16_t trans_buf[32 * 8]; int16_t pass1[32 * 32]; int16_t pass2[32 * 32]; const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t q[16]; + uint16_t *dst = CAST_TO_SHORTPTR(dest); for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; idct32_pass_loop++, out = pass2) { @@ -451,237 +536,241 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, // ----------------------------------------- // generate 16,17,30,31 // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 1, 31, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_31_64, cospi_1_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 17, 15, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_15_64, cospi_17_64, &q[1], &q[3]); // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); + q[4] = vaddq_s16(q[0], q[1]); + q[13] = vsubq_s16(q[0], q[1]); + q[6] = vaddq_s16(q[2], q[3]); + q[14] = vsubq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[5], &q[7]); // generate 18,19,28,29 // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 9, 23, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_23_64, cospi_9_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 25, 7, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_7_64, cospi_25_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); + q[13] = vsubq_s16(q[3], q[2]); + q[3] = vaddq_s16(q[3], q[2]); + q[14] = vsubq_s16(q[1], q[0]); + q[2] = vaddq_s16(q[1], q[0]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + do_butterfly(q[14], q[13], -cospi_4_64, -cospi_28_64, &q[1], &q[0]); // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[0]); + q[10] = vaddq_s16(q[7], q[1]); + q[15] = vaddq_s16(q[6], q[3]); + q[13] = vsubq_s16(q[5], q[0]); + q[14] = vsubq_s16(q[7], q[1]); + store_in_output(out, 16, 31, q[8], q[15]); + store_in_output(out, 17, 30, q[9], q[10]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[0], &q[1]); + store_in_output(out, 29, 18, q[1], q[0]); // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); + q[13] = vsubq_s16(q[4], q[2]); + q[14] = vsubq_s16(q[6], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[4], &q[6]); + store_in_output(out, 19, 28, q[4], q[6]); // ----------------------------------------- // BLOCK B: 20-23,24-27 // ----------------------------------------- // generate 20,21,26,27 // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 5, 27, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_27_64, cospi_5_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 21, 11, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_11_64, cospi_21_64, &q[1], &q[3]); // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // generate 22,23,24,25 // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 13, 19, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_19_64, cospi_13_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 29, 3, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_3_64, cospi_29_64, &q[4], &q[6]); // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_20_64, -cospi_12_64, &q[4], &q[7]); // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); + q[10] = vaddq_s16(q[7], q[1]); + q[11] = vaddq_s16(q[5], q[0]); + q[12] = vaddq_s16(q[6], q[2]); + q[15] = vaddq_s16(q[4], q[3]); // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + load_from_output(out, 16, 17, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[11]); + q[9] = vaddq_s16(q[13], q[10]); + q[13] = vsubq_s16(q[13], q[10]); + q[11] = vsubq_s16(q[14], q[11]); + store_in_output(out, 17, 16, q[9], q[8]); + load_from_output(out, 30, 31, &q[14], &q[9]); + q[8] = vsubq_s16(q[9], q[12]); + q[10] = vaddq_s16(q[14], q[15]); + q[14] = vsubq_s16(q[14], q[15]); + q[12] = vaddq_s16(q[9], q[12]); + store_in_output(out, 30, 31, q[10], q[12]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 25, 22, q[14], q[13]); + do_butterfly(q[8], q[11], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 24, 23, q[14], q[13]); // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + q[14] = vsubq_s16(q[5], q[0]); + q[13] = vsubq_s16(q[6], q[2]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[5], &q[6]); + q[14] = vsubq_s16(q[7], q[1]); + q[13] = vsubq_s16(q[4], q[3]); + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[0], &q[1]); // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + load_from_output(out, 18, 19, &q[14], &q[13]); + q[8] = vaddq_s16(q[14], q[1]); + q[9] = vaddq_s16(q[13], q[6]); + q[13] = vsubq_s16(q[13], q[6]); + q[1] = vsubq_s16(q[14], q[1]); + store_in_output(out, 18, 19, q[8], q[9]); + load_from_output(out, 28, 29, &q[8], &q[9]); + q[14] = vsubq_s16(q[8], q[5]); + q[10] = vaddq_s16(q[8], q[5]); + q[11] = vaddq_s16(q[9], q[0]); + q[0] = vsubq_s16(q[9], q[0]); + store_in_output(out, 28, 29, q[10], q[11]); // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[13], &q[14]); + store_in_output(out, 20, 27, q[13], q[14]); + do_butterfly(q[0], q[1], cospi_16_64, cospi_16_64, &q[1], &q[0]); + store_in_output(out, 21, 26, q[1], q[0]); // ----------------------------------------- // BLOCK C: 8-10,11-15 // ----------------------------------------- // generate 8,9,14,15 // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 2, 30, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_30_64, cospi_2_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 18, 14, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_14_64, cospi_18_64, &q[1], &q[3]); // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[1], &q[3]); // generate 10,11,12,13 // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + load_from_transformed(trans_buf, 10, 22, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_22_64, cospi_10_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 26, 6, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_6_64, cospi_26_64, &q[4], &q[6]); // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); + q[14] = vsubq_s16(q[4], q[5]); + q[5] = vaddq_s16(q[4], q[5]); + q[13] = vsubq_s16(q[6], q[7]); + q[6] = vaddq_s16(q[6], q[7]); // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + do_butterfly(q[14], q[13], -cospi_8_64, -cospi_24_64, &q[4], &q[7]); // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + q[8] = vaddq_s16(q[0], q[5]); + q[9] = vaddq_s16(q[1], q[7]); + q[13] = vsubq_s16(q[1], q[7]); + q[14] = vsubq_s16(q[3], q[4]); + q[10] = vaddq_s16(q[3], q[4]); + q[15] = vaddq_s16(q[2], q[6]); + store_in_output(out, 8, 15, q[8], q[15]); + store_in_output(out, 9, 14, q[9], q[10]); // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 13, 10, q[3], q[1]); + q[13] = vsubq_s16(q[0], q[5]); + q[14] = vsubq_s16(q[2], q[6]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); + store_in_output(out, 11, 12, q[1], q[3]); // ----------------------------------------- // BLOCK D: 0-3,4-7 // ----------------------------------------- // generate 4,5,6,7 // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + load_from_transformed(trans_buf, 4, 28, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_28_64, cospi_4_64, &q[0], &q[2]); + load_from_transformed(trans_buf, 20, 12, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_12_64, cospi_20_64, &q[1], &q[3]); // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); + q[13] = vsubq_s16(q[0], q[1]); + q[0] = vaddq_s16(q[0], q[1]); + q[14] = vsubq_s16(q[2], q[3]); + q[2] = vaddq_s16(q[2], q[3]); // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[1], &q[3]); // generate 0,1,2,3 // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + load_from_transformed(trans_buf, 0, 16, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_16_64, cospi_16_64, &q[5], &q[7]); + load_from_transformed(trans_buf, 8, 24, &q[14], &q[13]); + do_butterfly(q[14], q[13], cospi_24_64, cospi_8_64, &q[14], &q[6]); // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); + q[4] = vaddq_s16(q[7], q[6]); + q[7] = vsubq_s16(q[7], q[6]); + q[6] = vsubq_s16(q[5], q[14]); + q[5] = vaddq_s16(q[5], q[14]); // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); + q[8] = vaddq_s16(q[4], q[2]); + q[9] = vaddq_s16(q[5], q[3]); + q[10] = vaddq_s16(q[6], q[1]); + q[11] = vaddq_s16(q[7], q[0]); + q[12] = vsubq_s16(q[7], q[0]); + q[13] = vsubq_s16(q[6], q[1]); + q[14] = vsubq_s16(q[5], q[3]); + q[15] = vsubq_s16(q[4], q[2]); // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); + load_from_output(out, 14, 15, &q[0], &q[1]); + q[2] = vaddq_s16(q[8], q[1]); + q[3] = vaddq_s16(q[9], q[0]); + q[4] = vsubq_s16(q[9], q[0]); + q[5] = vsubq_s16(q[8], q[1]); + load_from_output(out, 16, 17, &q[0], &q[1]); + q[8] = final_add(q[4], q[1]); + q[9] = final_add(q[5], q[0]); + q[6] = final_sub(q[5], q[0]); + q[7] = final_sub(q[4], q[1]); if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, - q15s16); + idct32_bands_end_1st_pass(out, q); } else { - idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, - q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16); - dest += 8; + if (highbd_flag) { + highbd_idct32_bands_end_2nd_pass_bd8(out, dst, stride, q); + dst += 8; + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q); + dest += 8; + } } } } } + +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + vpx_idct32_32_neon(input, dest, stride, 0); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index d1eae24a22..a14b895431 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -12,6 +12,7 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/inv_txfm.h" static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, @@ -31,7 +32,8 @@ static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index 184d218941..175ba7fbc2 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -17,7 +17,7 @@ INCLUDE vpx_dsp/arm/idct_neon.asm.S - AREA Block, CODE, READONLY ; name this block of code + AREA Block, CODE, READONLY ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index bff98cbc16..8192ee4cf8 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -13,52 +13,47 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(kCospi); - uint32x2_t dest01_u32 = vdup_n_u32(0); - uint32x2_t dest32_u32 = vdup_n_u32(0); - int16x8_t a0, a1; - uint8x8_t d01, d32; - uint16x8_t d01_u16, d32_u16; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16q(input); - a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); - dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0); + s = load_u8(dst, stride); + dst += 2 * stride; + // The elements are loaded in reverse order. + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); dst += stride; - dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1); - dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); - dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); - d01_u16 = - vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32)); - d32_u16 = - vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); - d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); - d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0); + store_u8(dest, stride, d[0]); + dest += 2 * stride; + // The elements are stored in reverse order. + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1); - dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); - dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm deleted file mode 100644 index 29f678a038..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ /dev/null @@ -1,86 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vpx_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct8x8_1_add_neon| - - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index 7bcce913bd..ce9b459589 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -36,7 +36,8 @@ static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm deleted file mode 100644 index 2bfbcc5a52..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ /dev/null @@ -1,507 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct8x8_64_add_neon| - EXPORT |vpx_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - INCLUDE vpx_dsp/arm/idct_neon.asm.S - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d8, q2, #14 ; >> 14 - vrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q5, #14 ; >> 14 - vrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d14, q2, #14 ; >> 14 - vrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d18, q2, #14 ; >> 14 - vrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d22, q13, #14 ; >> 14 - vrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d26, q2, #14 ; >> 14 - vrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d30, q8, #14 ; >> 14 - vrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q9, #14 ; >> 14 - vrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q11, #14 ; >> 14 - vrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 - - ; transpose the input data - TRANSPOSE8X8 - - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r4, #0x3ec5 - - ; cospi_12_64 = 13623 - movw r5, #0x3537 - - ; cospi_20_64 = 9102 - movw r6, #0x238e - - ; cospi_16_64 = 11585 - movw r7, #0x2d41 - - ; cospi_24_64 = 6270 - movw r8, #0x187e - - ; cospi_8_64 = 15137 - movw r9, #0x3b21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_64_add_neon| - -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 - - ; transpose the input data - TRANSPOSE8X8 - - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r4, #0x3ec5 - - ; cospi_12_64 = 13623 - movw r5, #0x3537 - - ; cospi_20_64 = 9102 - movw r6, #0x238e - - ; cospi_16_64 = 11585 - movw r7, #0x2d41 - - ; cospi_24_64 = 6270 - movw r8, #0x187e - - ; cospi_8_64 = 15137 - movw r9, #0x3b21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q9, #14 ; >> 14 - vrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q11, #14 ; >> 14 - vrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_12_add_neon| - - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 279da67d74..7471387e47 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -13,94 +13,29 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint8_t *dest, - const int stride) { - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - - a0 = vrshrq_n_s16(a0, 5); - a1 = vrshrq_n_s16(a1, 5); - a2 = vrshrq_n_s16(a2, 5); - a3 = vrshrq_n_s16(a3, 5); - a4 = vrshrq_n_s16(a4, 5); - a5 = vrshrq_n_s16(a5, 5); - a6 = vrshrq_n_s16(a6, 5); - a7 = vrshrq_n_s16(a7, 5); - - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; - d4 = vld1_u8(dst); - dst += stride; - d5 = vld1_u8(dst); - dst += stride; - d6 = vld1_u8(dst); - dst += stride; - d7 = vld1_u8(dst); - - d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); - d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); - d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); - d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); - d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); - d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); - d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); - d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); - d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); - d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); - d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); - d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; - vst1_u8(dest, d4); - dest += stride; - vst1_u8(dest, d5); - dest += stride; - vst1_u8(dest, d6); - dest += stride; - vst1_u8(dest, d7); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t a0 = load_tran_low_to_s16q(input); - int16x8_t a1 = load_tran_low_to_s16q(input + 8); - int16x8_t a2 = load_tran_low_to_s16q(input + 16); - int16x8_t a3 = load_tran_low_to_s16q(input + 24); - int16x8_t a4 = load_tran_low_to_s16q(input + 32); - int16x8_t a5 = load_tran_low_to_s16q(input + 40); - int16x8_t a6 = load_tran_low_to_s16q(input + 48); - int16x8_t a7 = load_tran_low_to_s16q(input + 56); + int16x8_t a[8]; - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -110,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; - int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x4_t a[8]; + int16x8_t b[8]; - a0 = load_tran_low_to_s16d(input); - a1 = load_tran_low_to_s16d(input + 8); - a2 = load_tran_low_to_s16d(input + 16); - a3 = load_tran_low_to_s16d(input + 24); + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, - &a5, &a6, &a7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, - a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h index aecc543dbb..c02311326b 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/idct_neon.h @@ -8,87 +8,113 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_ARM_IDCT_NEON_H_ -#define VPX_DSP_ARM_IDCT_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_IDCT_NEON_H_ #include #include "./vpx_config.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" -DECLARE_ALIGNED(16, static const int16_t, kCospi[8]) = { - 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, - 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, - 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, - -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */ +static const int16_t kCospi[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; -DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = { - 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, - 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, - 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, - -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */ +static const int32_t kCospi32[16] = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; //------------------------------------------------------------------------------ -// Helper functions used to load tran_low_t into int16, narrowing if necessary. - -static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth +static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) { #if CONFIG_VP9_HIGHBITDEPTH - const int32x4x2_t v0 = vld2q_s32(buf); - const int32x4x2_t v1 = vld2q_s32(buf + 8); - const int16x4_t s0 = vmovn_s32(v0.val[0]); - const int16x4_t s1 = vmovn_s32(v0.val[1]); - const int16x4_t s2 = vmovn_s32(v1.val[0]); - const int16x4_t s3 = vmovn_s32(v1.val[1]); - int16x8x2_t res; - res.val[0] = vcombine_s16(s0, s2); - res.val[1] = vcombine_s16(s1, s3); - return res; + return vqaddq_s16(a, b); #else - return vld2q_s16(buf); + return vaddq_s16(a, b); #endif } -static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +static INLINE int16x8_t final_sub(const int16x8_t a, const int16x8_t b) { #if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - const int32x4_t v1 = vld1q_s32(buf + 4); - const int16x4_t s0 = vmovn_s32(v0); - const int16x4_t s1 = vmovn_s32(v1); - return vcombine_s16(s0, s1); + return vqsubq_s16(a, b); #else - return vld1q_s16(buf); -#endif -} - -static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { -#if CONFIG_VP9_HIGHBITDEPTH - const int32x4_t v0 = vld1q_s32(buf); - return vmovn_s32(v0); -#else - return vld1_s16(buf); + return vsubq_s16(a, b); #endif } //------------------------------------------------------------------------------ -// Multiply a by a_const. Saturate, shift and narrow by 14. +static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vaddq_s32(s0.val[0], s1.val[0]); + t.val[1] = vaddq_s32(s0.val[1], s1.val[1]); + return t; +} + +static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vsubq_s32(s0.val[0], s1.val[0]); + t.val[1] = vsubq_s32(s0.val[1], s1.val[1]); + return t; +} + +//------------------------------------------------------------------------------ + +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { - // Shift by 14 + rounding will be within 16 bits for well formed streams. - // See WRAPLOW and dct_const_round_shift for details. + // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed + // streams. See WRAPLOW and dct_const_round_shift for details. // This instruction doubles the result and returns the high half, essentially // resulting in a right shift by 15. By multiplying the constant first that - // becomes a right shift by 14. + // becomes a right shift by DCT_CONST_BITS. // The largest possible value used here is // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just* // within the range of int16_t (+32767 / -32768) even when negated. return vqrdmulhq_n_s16(a, a_const * 2); } -// Add a and b, then multiply by ab_const. Shift and narrow by 14. +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t add_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { // In both add_ and it's pair, sub_, the input for well-formed streams will be @@ -98,94 +124,162 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16( // input) this function can not use vaddq_s16. // In order to match existing behavior and intentionally out of range tests, // expand the addition up to 32 bits to prevent truncation. - int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } -// Subtract b from a, then multiply by ab_const. Shift and narrow by 14. +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { - int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by -// 14. +// DCT_CONST_BITS. static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( const int16x8_t a, const int16_t a_const, const int16x8_t b, const int16_t b_const) { - int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); - int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); - temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); - temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); - return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); +} + +//------------------------------------------------------------------------------ + +// Note: The following 4 functions could use 32-bit operations for bit-depth 10. +// However, although it's 20% faster with gcc, it's 20% slower with clang. +// Use 64-bit operations for now. + +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t +multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { + int64x2_t b[4]; + + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + return dct_const_round_shift_high_4x2(b); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. +static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + int32x4_t t[2]; + int64x2_t c[4]; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); +} + +// Subtract b from a, then multiply by ab_const. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { + int32x4_t t[2]; + int64x2_t c[4]; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// DCT_CONST_BITS. +static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( + const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, + const int32_t b_const) { + int64x2_t c[4]; + c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); + c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); + c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); + c[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); + c[0] = vmlal_n_s32(c[0], vget_low_s32(b.val[0]), b_const); + c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); + c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); + c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); + return dct_const_round_shift_high_4x2(c); } // Shift the output down by 6 and add it to the destination buffer. -static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, - const int16x8_t a2, const int16x8_t a3, - const int16x8_t a4, const int16x8_t a5, - const int16x8_t a6, const int16x8_t a7, - uint8_t *b, const int b_stride) { - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; - b0 = vld1_u8(b); - b += b_stride; - b1 = vld1_u8(b); - b += b_stride; - b2 = vld1_u8(b); - b += b_stride; - b3 = vld1_u8(b); - b += b_stride; - b4 = vld1_u8(b); - b += b_stride; - b5 = vld1_u8(b); - b += b_stride; - b6 = vld1_u8(b); - b += b_stride; - b7 = vld1_u8(b); - b -= (7 * b_stride); +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); // c = b + (a >> 6) - c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); - c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); - c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); - c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); - c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); - c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); - c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); - c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); - b0 = vqmovun_s16(c0); - b1 = vqmovun_s16(c1); - b2 = vqmovun_s16(c2); - b3 = vqmovun_s16(c3); - b4 = vqmovun_s16(c4); - b5 = vqmovun_s16(c5); - b6 = vqmovun_s16(c6); - b7 = vqmovun_s16(c7); + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); - vst1_u8(b, b0); - b += b_stride; - vst1_u8(b, b1); - b += b_stride; - vst1_u8(b, b2); - b += b_stride; - vst1_u8(b, b3); - b += b_stride; - vst1_u8(b, b4); - b += b_stride; - vst1_u8(b, b5); - b += b_stride; - vst1_u8(b, b6); - b += b_stride; - vst1_u8(b, b7); + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); } static INLINE uint8x16_t create_dcq(const int16_t dc) { @@ -194,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) { return vdupq_n_u8((uint8_t)t); } -static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, - int16x8_t *const a0, - int16x8_t *const a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, 14); - b1 = vrshrn_n_s32(c1, 14); - b2 = vrshrn_n_s32(c2, 14); - b3 = vrshrn_n_s32(c3, 14); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); } -static INLINE void idct8x8_12_pass1_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, - int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, - int16x4_t *const io6, int16x4_t *const io7) { +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { int16x4_t step1[8], step2[8]; int32x4_t t32[2]; - transpose_s16_4x4d(io0, io1, io2, io3); + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); // stage 1 - step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); - step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); - step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); - step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); // stage 2 - step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); - step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); - step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); step2[4] = vadd_s16(step1[4], step1[5]); step2[5] = vsub_s16(step1[4], step1[5]); @@ -259,36 +350,31 @@ static INLINE void idct8x8_12_pass1_bd8( t32[1] = vmull_lane_s16(step2[6], cospis0, 2); t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); - step1[5] = vrshrn_n_s32(t32[0], 14); - step1[6] = vrshrn_n_s32(t32[1], 14); + step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 - *io0 = vadd_s16(step1[0], step2[7]); - *io1 = vadd_s16(step1[1], step1[6]); - *io2 = vadd_s16(step1[2], step1[5]); - *io3 = vadd_s16(step1[3], step2[4]); - *io4 = vsub_s16(step1[3], step2[4]); - *io5 = vsub_s16(step1[2], step1[5]); - *io6 = vsub_s16(step1[1], step1[6]); - *io7 = vsub_s16(step1[0], step2[7]); + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); } -static INLINE void idct8x8_12_pass2_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, - const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, - const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, - int16x8_t *const output1, int16x8_t *const output2, - int16x8_t *const output3, int16x8_t *const output4, - int16x8_t *const output5, int16x8_t *const output6, - int16x8_t *const output7) { +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { int16x8_t in[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, - input7, &in[0], &in[1], &in[2], &in[3]); + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); // stage 1 step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); @@ -318,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *output0 = vaddq_s16(step1[0], step2[7]); - *output1 = vaddq_s16(step1[1], step1[6]); - *output2 = vaddq_s16(step1[2], step1[5]); - *output3 = vaddq_s16(step1[3], step2[4]); - *output4 = vsubq_s16(step1[3], step2[4]); - *output5 = vsubq_s16(step1[2], step1[5]); - *output6 = vsubq_s16(step1[1], step1[6]); - *output7 = vsubq_s16(step1[0], step2[7]); + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io0, int16x8_t *const io1, - int16x8_t *const io2, int16x8_t *const io3, - int16x8_t *const io4, int16x8_t *const io5, - int16x8_t *const io6, - int16x8_t *const io7) { - int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - - transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s16(*io1); - input_1h = vget_high_s16(*io1); - input_3l = vget_low_s16(*io3); - input_3h = vget_high_s16(*io3); - input_5l = vget_low_s16(*io5); - input_5h = vget_high_s16(*io5); - input_7l = vget_low_s16(*io7); - input_7h = vget_high_s16(*io7); - step1l[0] = vget_low_s16(*io0); - step1h[0] = vget_high_s16(*io0); - step1l[1] = vget_low_s16(*io2); - step1h[1] = vget_high_s16(*io2); - step1l[2] = vget_low_s16(*io4); - step1h[2] = vget_high_s16(*io4); - step1l[3] = vget_low_s16(*io6); - step1h[3] = vget_high_s16(*io6); + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); - t32[0] = vmull_lane_s16(input_1l, cospis1, 3); - t32[1] = vmull_lane_s16(input_1h, cospis1, 3); - t32[2] = vmull_lane_s16(input_3l, cospis1, 2); - t32[3] = vmull_lane_s16(input_3h, cospis1, 2); - t32[4] = vmull_lane_s16(input_3l, cospis1, 1); - t32[5] = vmull_lane_s16(input_3h, cospis1, 1); - t32[6] = vmull_lane_s16(input_1l, cospis1, 0); - t32[7] = vmull_lane_s16(input_1h, cospis1, 0); - t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); - t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); - t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); - t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); - t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); - t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); - t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); - t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - t16[4] = vrshrn_n_s32(t32[4], 14); - t16[5] = vrshrn_n_s32(t32[5], 14); - t16[6] = vrshrn_n_s32(t32[6], 14); - t16[7] = vrshrn_n_s32(t32[7], 14); - step1[4] = vcombine_s16(t16[0], t16[1]); - step1[5] = vcombine_s16(t16[2], t16[3]); - step1[6] = vcombine_s16(t16[4], t16[5]); - step1[7] = vcombine_s16(t16[6], t16[7]); + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); // stage 2 t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); @@ -414,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - t16[4] = vrshrn_n_s32(t32[4], 14); - t16[5] = vrshrn_n_s32(t32[5], 14); - t16[6] = vrshrn_n_s32(t32[6], 14); - t16[7] = vrshrn_n_s32(t32[7], 14); - step2[0] = vcombine_s16(t16[0], t16[1]); - step2[1] = vcombine_s16(t16[2], t16[3]); - step2[2] = vcombine_s16(t16[4], t16[5]); - step2[3] = vcombine_s16(t16[6], t16[7]); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); step2[4] = vaddq_s16(step1[4], step1[5]); step2[5] = vsubq_s16(step1[4], step1[5]); @@ -444,22 +498,422 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *io0 = vaddq_s16(step1[0], step2[7]); - *io1 = vaddq_s16(step1[1], step1[6]); - *io2 = vaddq_s16(step1[2], step1[5]); - *io3 = vaddq_s16(step1[3], step2[4]); - *io4 = vsubq_s16(step1[3], step2[4]); - *io5 = vsubq_s16(step1[2], step1[5]); - *io6 = vsubq_s16(step1[1], step1[6]); - *io7 = vsubq_s16(step1[0], step2[7]); + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); } -#endif // VPX_DSP_ARM_IDCT_NEON_H_ +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26N_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26N_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26N_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26N_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26N_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26N_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); + dct_const_round_shift_low_8_dual(t32, d0, d1); +} + +static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, + int16x8_t *const out) { +#if CONFIG_VP9_HIGHBITDEPTH + // Use saturating add/sub to avoid overflow in 2nd pass + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +#else + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); +#endif +} + +static INLINE void idct16x16_store_pass1(const int16x8_t *const out, + int16_t *output) { + // Save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); +} + +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} + +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, &dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); + vst1q_u16(*dest, d); + *dest += stride; +} + +static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); +} + +void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + +void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output); + +void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input, + int16_t *const output, void *const dest, + const int stride, const int highbd_flag); + +void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, + const int stride, const int highbd_flag); + +void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output); +void vpx_idct32_16_neon(const int16_t *const input, void *const output, + const int stride, const int highbd_flag); + +void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, + const int highbd_flag); + +#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c index fb1fa6b681..4f909e4935 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -12,51 +12,47 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; - sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -72,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -92,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -129,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -150,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -173,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -182,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -191,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16x2_t dc_dup; + const uint8x16_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -243,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -252,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -262,123 +232,629 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); + + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. + col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. + col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); } // ----------------------------------------------------------------------------- @@ -390,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -422,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ -489,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -496,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); @@ -669,6 +1122,452 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... + // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. + d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. + d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. + d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm index a042d40acb..a81a9d1013 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm @@ -201,7 +201,7 @@ str lr, [sp, #16] ; thresh1 add sp, #4 pop {r0-r1, lr} - add r0, r1, lsl #3 ; s + 8 * pitch + add r0, r0, r1, lsl #3 ; s + 8 * pitch b vpx_lpf_vertical_8_neon ENDP ; |vpx_lpf_vertical_8_dual_neon| diff --git a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c index 7419cea022..579096d78a 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -162,7 +162,7 @@ FUN_FLIP_SIGN(16, q_) // flip_sign_16 #define FUN_FLIP_SIGN_BACK(w, r) \ static INLINE uint8x##w##_t flip_sign_back_##w(const int8x##w##_t v) { \ - const int8x##w##_t sign_bit = vdup##r##n_s8(0x80); \ + const int8x##w##_t sign_bit = vdup##r##n_s8((int8_t)0x80); \ return vreinterpret##r##u8_s8(veor##r##s8(v, sign_bit)); \ } @@ -975,6 +975,17 @@ FUN_LPF_16_KERNEL(_, 8) // lpf_16_kernel FUN_LPF_16_KERNEL(_dual_, 16) // lpf_16_dual_kernel #undef FUN_LPF_16_KERNEL +// Quiet warnings of the form: 'vpx_dsp/arm/loopfilter_neon.c|981 col 42| +// warning: 'oq1' may be used uninitialized in this function +// [-Wmaybe-uninitialized]', for oq1-op1. Without reworking the code or adding +// an additional branch this warning cannot be silenced otherwise. The +// loopfilter is only called when needed for a block so these output pixels +// will be set. +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + void vpx_lpf_horizontal_16_neon(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, op6, @@ -1090,3 +1101,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, vget_high_u8(oq0), vget_high_u8(oq1)); } } + +#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h new file mode 100644 index 0000000000..7088eb5450 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_VPX_DSP_ARM_MEM_NEON_H_ + +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Support for these xN intrinsics is lacking in older versions of GCC. +#if defined(__GNUC__) && !defined(__clang__) +#if __GNUC__ < 8 || defined(__arm__) +static INLINE uint8x16x2_t vld1q_u8_x2(uint8_t const *ptr) { + uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } }; + return res; +} +#endif + +#if __GNUC__ < 9 || defined(__arm__) +static INLINE uint8x16x3_t vld1q_u8_x3(uint8_t const *ptr) { + uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16), + vld1q_u8(ptr + 2 * 16) } }; + return res; +} +#endif +#endif + +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | + ((uint64_t)(uint16_t)c2 << 32) | ((uint64_t)c3 << 48)); +} + +static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) { + return vcreate_s32((uint32_t)c0 | ((uint64_t)(uint32_t)c1 << 32)); +} + +static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1, + const int32_t c2, const int32_t c3) { + return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3)); +} + +// Helper functions used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} +#endif + +// Propagate type information to the compiler. Without this the compiler may +// assume the required alignment of uint32_t (4 bytes) and add alignment hints +// to the memory access. +// +// This is used for functions operating on uint8_t which wish to load or store 4 +// values at a time but which may not be on 4 byte boundaries. +static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { + memcpy(buf, &a, 4); +} + +// Load 4 contiguous bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, + ptrdiff_t stride) { + uint32_t a; + uint32x2_t a_u32 = vdup_n_u32(0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vset_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + a_u32 = vset_lane_u32(a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64 = vdupq_n_u64(0); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vsetq_lane_u64(a, a_u64, 0); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, + ptrdiff_t stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +// Store 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8q(uint8_t *buf, ptrdiff_t stride, + const uint8x16_t a) { + const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2)); + buf += stride; + uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3)); +} + +// Load 2 sets of 4 bytes when alignment is guaranteed. +static INLINE uint8x8_t load_u8(const uint8_t *buf, ptrdiff_t stride) { + uint32x2_t a = vdup_n_u32(0); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + a = vld1_lane_u32((const uint32_t *)buf, a, 0); + buf += stride; + a = vld1_lane_u32((const uint32_t *)buf, a, 1); + return vreinterpret_u8_u32(a); +} + +// Store 2 sets of 4 bytes when alignment is guaranteed. +static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { + uint32x2_t a_u32 = vreinterpret_u32_u8(a); + + assert(!((intptr_t)buf % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + vst1_lane_u32((uint32_t *)buf, a_u32, 0); + buf += stride; + vst1_lane_u32((uint32_t *)buf, a_u32, 1); +} + +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + +static INLINE void load_u8_8x3(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); +} + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7, + uint8x8_t *const s8, uint8x8_t *const s9, + uint8x8_t *const s10) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); + s += p; + *s8 = vld1_u8(s); + s += p; + *s9 = vld1_u8(s); + s += p; + *s10 = vld1_u8(s); +} + +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, + const uint8x16_t s0, const uint8x16_t s1, + const uint8x16_t s2, const uint8x16_t s3, + const uint8x16_t s4, const uint8x16_t s5, + const uint8x16_t s6, const uint8x16_t s7) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); + s += p; + vst1q_u8(s, s4); + s += p; + vst1q_u8(s, s5); + s += p; + vst1q_u8(s, s6); + s += p; + vst1q_u8(s, s7); +} + +static INLINE void store_u16_4x3(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2) { + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); +} + +static INLINE void load_s16_4x3(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void load_s16_4x11(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6, int16x4_t *s7, int16x4_t *s8, + int16x4_t *s9, int16x4_t *s10) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); + s += p; + *s8 = vld1_s16(s); + s += p; + *s9 = vld1_s16(s); + s += p; + *s10 = vld1_s16(s); +} + +static INLINE void store_u16_4x4(uint16_t *s, const ptrdiff_t p, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += p; + vst1_u16(s, s1); + s += p; + vst1_u16(s, s2); + s += p; + vst1_u16(s, s3); +} + +static INLINE void load_s16_4x7(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); +} + +static INLINE void load_s16_8x3(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void store_u16_8x4(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_u16_8x3(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); +} + +static INLINE void load_s16_8x7(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); +} + +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + +static INLINE void load_s16_4x8(const int16_t *s, const ptrdiff_t p, + int16x4_t *s0, int16x4_t *s1, int16x4_t *s2, + int16x4_t *s3, int16x4_t *s4, int16x4_t *s5, + int16x4_t *s6, int16x4_t *s7) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); + s += p; + *s4 = vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + +static INLINE void load_s16_8x11(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6, int16x8_t *s7, int16x8_t *s8, + int16x8_t *s9, int16x8_t *s10) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); + s += p; + *s8 = vld1q_s16(s); + s += p; + *s9 = vld1q_s16(s); + s += p; + *s10 = vld1q_s16(s); +} + +static INLINE void load_s16_8x12(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6, int16x8_t *s7, int16x8_t *s8, + int16x8_t *s9, int16x8_t *s10, + int16x8_t *s11) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); + s += p; + *s8 = vld1q_s16(s); + s += p; + *s9 = vld1q_s16(s); + s += p; + *s10 = vld1q_s16(s); + s += p; + *s11 = vld1q_s16(s); +} + +static INLINE void load_s16_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *s0, int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, int16x8_t *s5, + int16x8_t *s6, int16x8_t *s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 0000000000..e2351fa2cc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE int16x8_t +quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t dequant = vld1q_s16(dequant_ptr); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, + quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); + + do { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + n_coeffs -= 8; + } while (n_coeffs > 0); + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff_ptr) { + int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff_ptr, dqcoeff_0); + vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff_ptr, + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +static INLINE int16x8_t +quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16x8_t zbin, + const int16x8_t round, const int16x8_t quant, + const int16x8_t quant_shift, const int16x8_t dequant) { + // Load coeffs as 8 x 16-bit ints, take sign and abs values + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + // Calculate mask of elements outside the bin + const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + // Get the rounded values + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + // Only keep the relevant coeffs + qcoeff = vandq_s16(qcoeff, zbin_mask); + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); + + return qcoeff; +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + const int16_t *iscan = scan_order->iscan; + + // Only the first element of each vector is DC. + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); + int16x8_t dequant = vld1q_s16(dequant_ptr); + + // Process first 8 values which include a dc component. + { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + + { + zbin = vdupq_lane_s16(vget_low_s16(zbin), 1); + round = vdupq_lane_s16(vget_low_s16(round), 1); + quant = vdupq_lane_s16(vget_low_s16(quant), 1); + quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1); + dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); + + for (i = 1; i < 32 * 32 / 8; ++i) { + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); + + const int16x8_t qcoeff = + quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, + quant, quant_shift, dequant); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); + + __builtin_prefetch(coeff_ptr + 64); + coeff_ptr += 8; + iscan += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + } + } + +#if VPX_ARCH_AARCH64 + *eob_ptr = vmaxvq_u16(eob_max); +#else + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +#endif // VPX_ARCH_AARCH64 +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c index dc20398000..713eec7a92 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -10,215 +10,219 @@ #include +#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); } -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. -static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. -static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); + i++; + } while (i < h); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); +} + +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); + + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); + + i += 2; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); +} + +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - &vec_sum_ref3_hi); +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) + +#undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); -} +#undef SAD_SKIP_WXH_4D_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c new file mode 100644 index 0000000000..933fc48b8c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c index ff3228768c..4dd87ddc0f 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c @@ -11,213 +11,381 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } + } while (--i != 0); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); - return vget_lane_u32(d5, 0); + return horizontal_add_uint32x4(sum_u32); } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; - int i; +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); - } + } while (--i != 0); - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); - - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + return horizontal_add_uint32x4(sum); } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; - int i; +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) - return vget_lane_u32(d5, 0); -} +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) + +#undef SAD_WXH_NEON + +#define SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); + +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); +} + +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); -} +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - int i; - uint16x8_t vec_accum = vdupq_n_u16(0); +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); - } - return horizontal_add_16x8(vec_accum); -} +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 0000000000..fbc0b8d75f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) +SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c new file mode 100644 index 0000000000..2dd57e596c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 0000000000..8777773918 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c index f044e11a15..d92f1615d7 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -12,122 +12,478 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; +// Process a block exactly 4 wide and a multiple of 2 high. +static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Process a block exactly 8 wide and any height. +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + vst1_u8(dst_ptr, blend_u8); + + src_ptr += src_stride; + dst_ptr += 8; + } while (--i != 0); +} + +// Process a block which is a mutiple of 16 wide and any height. +static void var_filter_block2d_bil_large(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, int filter_offset) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3); + uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3); + vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi)); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32, + dst_height, filter_offset); +} +static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_height, int filter_offset) { + var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64, + dst_height, filter_offset); +} + +static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + vst1q_u8(dst_ptr + j, avg); + + j += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -} -static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); - unsigned int i, j; - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 16) { - const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); - const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); - const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); - const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); - const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); - const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); - const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); - const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1) + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4. +static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); + + int i = dst_height; + do { + uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); } -unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, - int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8. +static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); - return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); + int i = dst_height; + do { + uint8x8_t s0 = vld1_u8(src_ptr); + uint8x8_t s1 = vld1_u8(src_ptr + pixel_step); + uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1); + uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3); + + uint8x8_t p = vld1_u8(second_pred); + uint8x8_t avg = vrhadd_u8(blend_u8, p); + + vst1_u8(dst_ptr, avg); + + src_ptr += src_stride; + dst_ptr += 8; + second_pred += 8; + } while (--i > 0); } -unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); - DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); +// Combine bilinear filter with vpx_comp_avg_pred for large blocks. +static void avg_pred_var_filter_block2d_bil_large( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint8_t *second_pred) { + const uint8x8_t f0 = vdup_n_u8(8 - filter_offset); + const uint8x8_t f1 = vdup_n_u8(filter_offset); - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); - return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); + int i = dst_height; + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint16x8_t blend_l = + vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1); + uint16x8_t blend_h = + vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1); + uint8x16_t blend_u8 = + vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3)); + + uint8x16_t p = vld1q_u8(second_pred); + uint8x16_t avg = vrhaddq_u8(blend_u8, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); } -unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); - DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); - return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16. +static void avg_pred_var_filter_block2d_bil_w16( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); } -unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, - int src_stride, int xoffset, - int yoffset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); - - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); - var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); - return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32. +static void avg_pred_var_filter_block2d_bil_w32( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); } + +// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64. +static void avg_pred_var_filter_block2d_bil_w64( + const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint8_t *second_pred) { + avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_comp_avg_pred. +static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s0 = vld1q_u8(src_ptr + j); + uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step); + uint8x16_t avg = vrhaddq_u8(s0, s1); + + uint8x16_t p = vld1q_u8(second_pred); + avg = vrhaddq_u8(avg, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_comp_avg_pred for blocks having width >= 16. +static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, + int dst_width, int dst_height, + const uint8_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint8x16_t s = vld1q_u8(src_ptr + j); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(s, p); + + vst1q_u8(dst_ptr + j, avg); + + j += 16; + second_pred += 16; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t tmp0[w * (h + padding)]; \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \ + xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } + +#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + if (xoffset == 0) { \ + uint8_t tmp[w * h]; \ + if (yoffset == 0) { \ + avg_pred(src, tmp, source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ + source_stride, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + avg_pred_var_filter_block2d_bil_w##w( \ + src, tmp, source_stride, source_stride, h, yoffset, second_pred); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ + second_pred); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ + xoffset, second_pred); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ + second_pred); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2) +SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2) + +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1) +SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1) + +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1) +SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c index ce81fb630f..2c008e48ab 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,129 @@ */ #include +#include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + int r = rows, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + + if (cols >= 16) { + do { + for (c = 0; c < cols; c += 16) { + const uint16x8_t s0 = vld1q_u16(&src[c + 0]); + const uint16x8_t s1 = vld1q_u16(&src[c + 8]); + const uint16x8_t p0 = vld1q_u16(&pred[c + 0]); + const uint16x8_t p1 = vld1q_u16(&pred[c + 8]); + const uint16x8_t d0 = vsubq_u16(s0, p0); + const uint16x8_t d1 = vsubq_u16(s1, p1); + vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 8) { + do { + for (c = 0; c < cols; c += 8) { + const uint16x8_t s = vld1q_u16(&src[c]); + const uint16x8_t p = vld1q_u16(&pred[c]); + const uint16x8_t d0 = vsubq_u16(s, p); + vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } else if (cols >= 4) { + do { + for (c = 0; c < cols; c += 4) { + const uint16x4_t s = vld1_u16(&src[c]); + const uint16x4_t p = vld1_u16(&pred[c]); + const uint16x4_t v_diff = vsub_u16(s, p); + vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff)); + } + diff_ptr += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 0000000000..11821dc10e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_VPX_DSP_ARM_SUM_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_s16(a); +#else + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); + return vget_lane_s32(d, 0); +#endif +} + +static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(a); +#else + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + return vget_lane_u32(d, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if VPX_ARCH_AARCH64 + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if VPX_ARCH_AARCH64 + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + +static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_s32(a); +#else + return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u32(a); +#else + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); +#endif +} + +static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s32(a); +#else + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +#endif +} + +static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if VPX_ARCH_AARCH64 + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t +horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c new file mode 100644 index 0000000000..074afe3258 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_neon.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { + if (size == 4) { + int16x4_t s[4]; + int32x4_t sum_s32; + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); + } else { + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; + + do { + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; + + do { + int16x8_t s[8]; + + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); + + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); + src += 8 * stride; + rows -= 8; + } while (rows); + + return horizontal_add_uint64x2(sum_u64); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c new file mode 100644 index 0000000000..a18cbbd736 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_squares_sve.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_dsp/arm/vpx_neon_sve_bridge.h" + +uint64_t vpx_sum_squares_2d_i16_sve(const int16_t *src, int stride, int size) { + if (size == 4) { + int16x4_t s[4]; + int64x2_t sum = vdupq_n_s64(0); + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + + int16x8_t s01 = vcombine_s16(s[0], s[1]); + int16x8_t s23 = vcombine_s16(s[2], s[3]); + + sum = vpx_dotq_s16(sum, s01, s01); + sum = vpx_dotq_s16(sum, s23, s23); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum)); + } else { + int rows = size; + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int cols = size; + + do { + int16x8_t s[8]; + load_s16_8x8(src_ptr, stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + + sum[0] = vpx_dotq_s16(sum[0], s[0], s[0]); + sum[1] = vpx_dotq_s16(sum[1], s[1], s[1]); + sum[2] = vpx_dotq_s16(sum[2], s[2], s[2]); + sum[3] = vpx_dotq_s16(sum[3], s[3], s[3]); + sum[0] = vpx_dotq_s16(sum[0], s[4], s[4]); + sum[1] = vpx_dotq_s16(sum[1], s[5], s[5]); + sum[2] = vpx_dotq_s16(sum[2], s[6], s[6]); + sum[3] = vpx_dotq_s16(sum[3], s[7], s[7]); + + src_ptr += 8; + cols -= 8; + } while (cols); + + src += 8 * stride; + rows -= 8; + } while (rows); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + + return horizontal_add_uint64x2(vreinterpretq_u64_s64(sum[0])); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h index 8366ce50b8..b6b46fc47e 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_ -#define VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ #include @@ -23,44 +23,77 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif return b0; } static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), vreinterpret_s64_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), vreinterpret_s64_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), vreinterpret_u8_u32(vget_high_u32(a1))); +#endif return b0; } static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -138,20 +171,16 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); // Swap 64 bit elements resulting in: - // c0.val[0]: 00 01 20 21 02 03 22 23 - // c0.val[1]: 10 11 30 31 12 13 32 33 + // c0: 00 01 20 21 02 03 22 23 + // c1: 10 11 30 31 12 13 32 33 - const int32x4_t c0 = - vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); - const int32x4_t c1 = - vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const int16x8x2_t d0 = - vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -169,20 +198,16 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1)); // Swap 64 bit elements resulting in: - // c0.val[0]: 00 01 20 21 02 03 22 23 - // c0.val[1]: 10 11 30 31 12 13 32 33 + // c0: 00 01 20 21 02 03 22 23 + // c1: 10 11 30 31 12 13 32 33 - const uint32x4_t c0 = - vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); - const uint32x4_t c1 = - vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const uint16x8x2_t d0 = - vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -281,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a6, const int16x4_t a7, int16x8_t *const o0, int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) { - // Swap 16 bit elements. Goes from: + // Combine rows. Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 @@ -291,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, // a6: 60 61 62 63 // a7: 70 71 72 73 // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - // b2.val[0]: 40 50 42 52 - // b2.val[1]: 41 51 43 53 - // b3.val[0]: 60 70 62 72 - // b3.val[1]: 61 71 63 73 + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - const int16x4x2_t b2 = vtrn_s16(a4, a5); - const int16x4x2_t b3 = vtrn_s16(a6, a7); + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 + + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 - // c0.val[1]: 02 12 22 32 - // c1.val[0]: 01 11 21 31 - // c1.val[1]: 03 13 23 33 - // c2.val[0]: 40 50 60 70 - // c2.val[1]: 42 52 62 72 - // c3.val[0]: 41 51 61 71 - // c3.val[1]: 43 53 63 73 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), - vreinterpret_s32_s16(b3.val[0])); - const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), - vreinterpret_s32_s16(b3.val[1])); + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); - // Swap 64 bit elements resulting in: - // o0: 00 10 20 30 40 50 60 70 - // o1: 01 11 21 31 41 51 61 71 - // o2: 02 12 22 32 42 52 62 72 - // o3: 03 13 23 33 43 53 63 73 - - *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), - vreinterpret_s16_s32(c2.val[0])); - *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), - vreinterpret_s16_s32(c3.val[0])); - *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), - vreinterpret_s16_s32(c2.val[1])); - *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), - vreinterpret_s16_s32(c3.val[1])); + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); } static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, @@ -512,12 +524,20 @@ static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, *a7 = vreinterpretq_s32_s64(c3.val[1]); } -// Note: Using 'd' registers or 'q' registers has almost identical speed. We use -// 'q' registers here to save some instructions. static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5, uint8x8_t *a6, uint8x8_t *a7) { - // Swap 8 bit elements. Goes from: + // Widen to 128-bit registers (usually a no-op once inlined.) + const uint8x16_t a0q = vcombine_u8(*a0, vdup_n_u8(0)); + const uint8x16_t a1q = vcombine_u8(*a1, vdup_n_u8(0)); + const uint8x16_t a2q = vcombine_u8(*a2, vdup_n_u8(0)); + const uint8x16_t a3q = vcombine_u8(*a3, vdup_n_u8(0)); + const uint8x16_t a4q = vcombine_u8(*a4, vdup_n_u8(0)); + const uint8x16_t a5q = vcombine_u8(*a5, vdup_n_u8(0)); + const uint8x16_t a6q = vcombine_u8(*a6, vdup_n_u8(0)); + const uint8x16_t a7q = vcombine_u8(*a7, vdup_n_u8(0)); + + // Zip 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 // a2: 20 21 22 23 24 25 26 27 @@ -527,47 +547,115 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, // a6: 60 61 62 63 64 65 66 67 // a7: 70 71 72 73 74 75 76 77 // to: - // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 - // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 - // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 - // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + // b0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // b1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // b2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // b3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const uint8x16_t b0 = vzipq_u8(a0q, a1q).val[0]; + const uint8x16_t b1 = vzipq_u8(a2q, a3q).val[0]; + const uint8x16_t b2 = vzipq_u8(a4q, a5q).val[0]; + const uint8x16_t b3 = vzipq_u8(a6q, a7q).val[0]; - const uint8x16x2_t b0 = - vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); - const uint8x16x2_t b1 = - vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + // Zip 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // c0.val[1]: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // c1.val[0]: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // c1.val[1]: 44 54 64 74 45 55 65 75 46 66 56 76 47 67 57 77 + const uint16x8x2_t c0 = + vzipq_u16(vreinterpretq_u16_u8(b0), vreinterpretq_u16_u8(b1)); + const uint16x8x2_t c1 = + vzipq_u16(vreinterpretq_u16_u8(b2), vreinterpretq_u16_u8(b3)); - // Swap 16 bit elements resulting in: - // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 - // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 - // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 - // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 - - const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), - vreinterpretq_u16_u8(b1.val[0])); - const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), - vreinterpretq_u16_u8(b1.val[1])); - - // Unzip 32 bit elements resulting in: + // Zip 32 bit elements resulting in: // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d0.val[1]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[0]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + const uint32x4x2_t d0 = vzipq_u32(vreinterpretq_u32_u16(c0.val[0]), vreinterpretq_u32_u16(c1.val[0])); - const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + const uint32x4x2_t d1 = vzipq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); - *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); - *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); - *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); - *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); } +// Transpose 8x8 to a new location. +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; +} + static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, @@ -624,6 +712,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -695,6 +784,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); @@ -710,6 +800,190 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, + int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, + int32x4x2_t *a6, int32x4x2_t *a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0: 00 10 02 12 01 11 03 13 + // b1: 20 30 22 32 21 31 23 33 + // b2: 40 50 42 52 41 51 43 53 + // b3: 60 70 62 72 61 71 63 73 + // b4: 04 14 06 16 05 15 07 17 + // b5: 24 34 26 36 25 35 27 37 + // b6: 44 54 46 56 45 55 47 57 + // b7: 64 74 66 76 65 75 67 77 + + const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]); + const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]); + const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]); + const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]); + const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]); + const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]); + const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]); + const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]); + + // Swap 64 bit elements resulting in: + // c0: 00 10 20 30 02 12 22 32 + // c1: 01 11 21 31 03 13 23 33 + // c2: 40 50 60 70 42 52 62 72 + // c3: 41 51 61 71 43 53 63 73 + // c4: 04 14 24 34 06 16 26 36 + // c5: 05 15 25 35 07 17 27 37 + // c6: 44 54 64 74 46 56 66 76 + // c7: 45 55 65 75 47 57 67 77 + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]); + const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]); + const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]); + const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]); + const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]); + const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]); + + // Swap 128 bit elements resulting in: + // a0: 00 10 20 30 40 50 60 70 + // a1: 01 11 21 31 41 51 61 71 + // a2: 02 12 22 32 42 52 62 72 + // a3: 03 13 23 33 43 53 63 73 + // a4: 04 14 24 34 44 54 64 74 + // a5: 05 15 25 35 45 55 65 75 + // a6: 06 16 26 36 46 56 66 76 + // a7: 07 17 27 37 47 57 67 77 + a0->val[0] = c0.val[0]; + a0->val[1] = c2.val[0]; + a1->val[0] = c1.val[0]; + a1->val[1] = c3.val[0]; + a2->val[0] = c0.val[1]; + a2->val[1] = c2.val[1]; + a3->val[0] = c1.val[1]; + a3->val[1] = c3.val[1]; + a4->val[0] = c4.val[0]; + a4->val[1] = c6.val[0]; + a5->val[0] = c5.val[0]; + a5->val[1] = c7.val[0]; + a6->val[0] = c4.val[1]; + a6->val[1] = c6.val[1]; + a7->val[0] = c5.val[1]; + a7->val[1] = c7.val[1]; +} + +// Helper transpose function for highbd FDCT variants +static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, + int32x4_t *right /*[8]*/, + int32x4_t *out_left /*[8]*/, + int32x4_t *out_right /*[8]*/) { + int32x4x2_t out[8]; + + out[0].val[0] = left[0]; + out[0].val[1] = right[0]; + out[1].val[0] = left[1]; + out[1].val[1] = right[1]; + out[2].val[0] = left[2]; + out[2].val[1] = right[2]; + out[3].val[0] = left[3]; + out[3].val[1] = right[3]; + out[4].val[0] = left[4]; + out[4].val[1] = right[4]; + out[5].val[0] = left[5]; + out[5].val[1] = right[5]; + out[6].val[0] = left[6]; + out[6].val[1] = right[6]; + out[7].val[0] = left[7]; + out[7].val[1] = right[7]; + + transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); + + out_left[0] = out[0].val[0]; + out_left[1] = out[1].val[0]; + out_left[2] = out[2].val[0]; + out_left[3] = out[3].val[0]; + out_left[4] = out[4].val[0]; + out_left[5] = out[5].val[0]; + out_left[6] = out[6].val[0]; + out_left[7] = out[7].val[0]; + out_right[0] = out[0].val[1]; + out_right[1] = out[1].val[1]; + out_right[2] = out[2].val[1]; + out_right[3] = out[3].val[1]; + out_right[4] = out[4].val[1]; + out_right[5] = out[5].val[1]; + out_right[6] = out[6].val[1]; + out_right[7] = out[7].val[1]; +} + +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, @@ -1107,6 +1381,45 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) { + int16x8_t t[8]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + t[0] = in0[8]; + t[1] = in0[9]; + t[2] = in0[10]; + t[3] = in0[11]; + t[4] = in0[12]; + t[5] = in0[13]; + t[6] = in0[14]; + t[7] = in0[15]; + in0[8] = in1[0]; + in0[9] = in1[1]; + in0[10] = in1[2]; + in0[11] = in1[3]; + in0[12] = in1[4]; + in0[13] = in1[5]; + in0[14] = in1[6]; + in0[15] = in1[7]; + in1[0] = t[0]; + in1[1] = t[1]; + in1[2] = t[2]; + in1[3] = t[3]; + in1[4] = t[4]; + in1[5] = t[5]; + in1[6] = t[6]; + in1[7] = t[7]; + + transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5], + &in0[6], &in0[7]); + transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13], + &in0[14], &in0[15]); + transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5], + &in1[6], &in1[7]); + transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13], + &in1[14], &in1[15]); +} + static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, const int a_stride, uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -1204,4 +1517,190 @@ static INLINE void load_and_transpose_s16_8x8(const int16_t *a, transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); } -#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ + +static INLINE void load_and_transpose_s32_8x8( + const int32_t *a, const int a_stride, int32x4x2_t *const a0, + int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3, + int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6, + int32x4x2_t *const a7) { + a0->val[0] = vld1q_s32(a); + a0->val[1] = vld1q_s32(a + 4); + a += a_stride; + a1->val[0] = vld1q_s32(a); + a1->val[1] = vld1q_s32(a + 4); + a += a_stride; + a2->val[0] = vld1q_s32(a); + a2->val[1] = vld1q_s32(a + 4); + a += a_stride; + a3->val[0] = vld1q_s32(a); + a3->val[1] = vld1q_s32(a + 4); + a += a_stride; + a4->val[0] = vld1q_s32(a); + a4->val[1] = vld1q_s32(a + 4); + a += a_stride; + a5->val[0] = vld1q_s32(a); + a5->val[1] = vld1q_s32(a + 4); + a += a_stride; + a6->val[0] = vld1q_s32(a); + a6->val[1] = vld1q_s32(a + 4); + a += a_stride; + a7->val[0] = vld1q_s32(a); + a7->val[1] = vld1q_s32(a + 4); + + transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_concat_s16_4x4(const int16x4_t a0, + const int16x4_t a1, + const int16x4_t a2, + const int16x4_t a3, int16x8_t *b0, + int16x8_t *b1) { + // Transpose 16-bit elements: + // a0: 00, 01, 02, 03 + // a1: 10, 11, 12, 13 + // a2: 20, 21, 22, 23 + // a3: 30, 31, 32, 33 + // + // b0: 00 10 20 30 01 11 21 31 + // b1: 02 12 22 32 03 13 23 33 + + int16x8_t a0q = vcombine_s16(a0, vdup_n_s16(0)); + int16x8_t a1q = vcombine_s16(a1, vdup_n_s16(0)); + int16x8_t a2q = vcombine_s16(a2, vdup_n_s16(0)); + int16x8_t a3q = vcombine_s16(a3, vdup_n_s16(0)); + + int16x8_t a02 = vzipq_s16(a0q, a2q).val[0]; + int16x8_t a13 = vzipq_s16(a1q, a3q).val[0]; + + int16x8x2_t a0123 = vzipq_s16(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_s16_8x4(const int16x8_t a0, + const int16x8_t a1, + const int16x8_t a2, + const int16x8_t a3, int16x8_t *b0, + int16x8_t *b1, int16x8_t *b2, + int16x8_t *b3) { + // Transpose 16-bit elements: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00 10 20 30 01 11 21 31 + // b1: 02 12 22 32 03 13 23 33 + // b2: 04 14 24 34 05 15 25 35 + // b3: 06 16 26 36 07 17 27 37 + + int16x8x2_t a02 = vzipq_s16(a0, a2); + int16x8x2_t a13 = vzipq_s16(a1, a3); + + int16x8x2_t a0123_lo = vzipq_s16(a02.val[0], a13.val[0]); + int16x8x2_t a0123_hi = vzipq_s16(a02.val[1], a13.val[1]); + + *b0 = a0123_lo.val[0]; + *b1 = a0123_lo.val[1]; + *b2 = a0123_hi.val[0]; + *b3 = a0123_hi.val[1]; +} + +static INLINE void transpose_concat_s8_8x4(int8x8_t a0, int8x8_t a1, + int8x8_t a2, int8x8_t a3, + int8x16_t *b0, int8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; + int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; + + int8x16x2_t a0123 = vzipq_s8(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_u8_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, 04, 05, 06, 07 + // a1: 10, 11, 12, 13, 14, 15, 16, 17 + // a2: 20, 21, 22, 23, 24, 25, 26, 27 + // a3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; + uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; + + uint8x16x2_t a0123 = vzipq_u8(a02, a13); + + *b0 = a0123.val[0]; + *b1 = a0123.val[1]; +} + +static INLINE void transpose_concat_s8_4x4(int8x8_t a0, int8x8_t a1, + int8x8_t a2, int8x8_t a3, + int8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0)); + int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0)); + int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0)); + int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0)); + + int8x16_t a02 = vzipq_s8(a0q, a2q).val[0]; + int8x16_t a13 = vzipq_s8(a1q, a3q).val[0]; + + *b = vzipq_s8(a02, a13).val[0]; +} + +static INLINE void transpose_concat_u8_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b) { + // Transpose 8-bit elements and concatenate result rows as follows: + // a0: 00, 01, 02, 03, XX, XX, XX, XX + // a1: 10, 11, 12, 13, XX, XX, XX, XX + // a2: 20, 21, 22, 23, XX, XX, XX, XX + // a3: 30, 31, 32, 33, XX, XX, XX, XX + // + // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + + uint8x16_t a0q = vcombine_u8(a0, vdup_n_u8(0)); + uint8x16_t a1q = vcombine_u8(a1, vdup_n_u8(0)); + uint8x16_t a2q = vcombine_u8(a2, vdup_n_u8(0)); + uint8x16_t a3q = vcombine_u8(a3, vdup_n_u8(0)); + + uint8x16_t a02 = vzipq_u8(a0q, a2q).val[0]; + uint8x16_t a13 = vzipq_u8(a1q, a3q).val[0]; + + *b = vzipq_u8(a02, a13).val[0]; +} + +#endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c index b6d7f86a4b..efb2c1d8da 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon.c @@ -9,391 +9,324 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); +// Process a block of width 4 two rows at a time. +static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows. + assert(h <= 256); + + do { + const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = (uint32_t)horizontal_add_int32x4(sse_s32); } -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); +// Process a block of width 8 one row at a time. +static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; + + // Number of rows we can process before 'sum_s16' overflows: + // 32767 / 255 ~= 128 + assert(h <= 128); + + do { + const uint8x8_t s = vld1_u8(src_ptr); + const uint8x8_t r = vld1_u8(ref_ptr); + const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r)); + + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); } -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h, uint32_t *sse, int *sum) { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int i = h; - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = - vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); - v_sse_hi = - vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; + // Number of rows we can process before 'sum_s16' accumulators overflow: + // 32767 / 255 ~= 128, so 128 16-wide rows. + assert(h <= 128); + + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1])); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int w, int h, int h_limit, + unsigned int *sse, int *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit + // accumulator overflows. After hitting this limit we accumulate into 32-bit + // elements. + int h_tmp = h > h_limit ? h_limit : h; + + int i = 0; + do { + int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const int16x8_t diff_l = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r))); + const int16x8_t diff_h = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r))); + + sum_s16[0] = vaddq_s16(sum_s16[0], diff_l); + sum_s16[1] = vaddq_s16(sum_s16[1], diff_h); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l)); + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]); + sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]); + + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1])); +} + +static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum); +} + +static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); +} + +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum); +} + +#define VARIANCE_WXH_NEON(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +VARIANCE_WXH_NEON(4, 4, 4) +VARIANCE_WXH_NEON(4, 8, 5) + +VARIANCE_WXH_NEON(8, 4, 5) +VARIANCE_WXH_NEON(8, 8, 6) +VARIANCE_WXH_NEON(8, 16, 7) + +VARIANCE_WXH_NEON(16, 8, 7) +VARIANCE_WXH_NEON(16, 16, 8) +VARIANCE_WXH_NEON(16, 32, 9) + +VARIANCE_WXH_NEON(32, 16, 9) +VARIANCE_WXH_NEON(32, 32, 10) +VARIANCE_WXH_NEON(32, 64, 11) + +VARIANCE_WXH_NEON(64, 32, 11) +VARIANCE_WXH_NEON(64, 64, 12) + +#undef VARIANCE_WXH_NEON + +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; + + s = vld1q_u8(src_ptr); + src_ptr += src_stride; + r = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff = vabdq_u8(s, r); + + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); -} - -unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - ((sum * sum) >> 6); -} - -unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); -} - -unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} - -unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, - 32, 32, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} - -unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); -} - -unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); -} - -unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr, - int source_stride, +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; + int ref_stride) { + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); - for (i = 0; i < 4; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + return horizontal_add_uint32x4(sse); +} - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return *sse; \ } - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} - -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); - - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} +#undef VPX_MSE_WXH_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c new file mode 100644 index 0000000000..ab843e9fca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/variance_neon_dotprod.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 four rows at a time. +static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. +static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..d8e4bcc3a7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers***************************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..7a77747fec --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm @@ -0,0 +1,439 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm deleted file mode 100644 index e279d570fc..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ /dev/null @@ -1,292 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_avg_horiz_neon| - EXPORT |vpx_convolve8_avg_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_avg_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_avg_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_avg_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; slightly out of order load to match the existing data - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 - - sub r2, r2, r3, lsl #2 ; reset for store - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|vpx_convolve8_avg_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -vpx_convolve8_avg_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_avg_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - vld1.u32 {d6[0]}, [r5@32], r3 - vld1.u32 {d6[1]}, [r8@32], r3 - vld1.u32 {d7[0]}, [r5@32], r3 - vld1.u32 {d7[1]}, [r8@32], r3 - - pld [r7] - pld [r4] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r5] - pld [r8] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - sub r5, r5, r3, lsl #1 ; reset for store - sub r8, r8, r3, lsl #1 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..d310a83dad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm @@ -0,0 +1,486 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + add r14, r1, r6 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..c5695fbda8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm @@ -0,0 +1,487 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlal.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + add r14, r1, r6 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm new file mode 100644 index 0000000000..fa1b732466 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm new file mode 100644 index 0000000000..90b2c8fef7 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 1386838eea..037ea1142d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -14,198 +14,122 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -// Note: -// 1. src is not always 32-bit aligned, so don't call vld1_lane_u32(src). -// 2. After refactoring the shared code in kernel loops with inline functions, -// the decoder speed dropped a lot when using gcc compiler. Therefore there is -// no refactoring for those parts by now. -// 3. For horizontal convolve, there is an alternative optimization that -// convolves a single row in each loop. For each row, 8 sample banks with 4 or 8 -// samples in each are read from memory: src, (src+1), (src+2), (src+3), -// (src+4), (src+5), (src+6), (src+7), or prepared by vector extract -// instructions. This optimization is much faster in speed unit test, but slowed -// down the whole decoder by 5%. +static INLINE void convolve_4tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t x_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); -static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t filter_taps[4] = { vdup_lane_u8(x_filter, 2), + vdup_lane_u8(x_filter, 3), + vdup_lane_u8(x_filter, 4), + vdup_lane_u8(x_filter, 5) }; + + if (w == 4) { + do { + uint8x8_t s01[4]; + + s01[0] = load_unaligned_u8(src + 0, src_stride); + s01[1] = load_unaligned_u8(src + 1, src_stride); + s01[2] = load_unaligned_u8(src + 2, src_stride); + s01[3] = load_unaligned_u8(src + 3, src_stride); + + uint8x8_t d01 = convolve4_8(s01[0], s01[1], s01[2], s01[3], filter_taps); + + store_unaligned_u8(dst, dst_stride, d01); + + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x8_t s0[4], s1[4]; + + s0[0] = vld1_u8(s + 0); + s0[1] = vld1_u8(s + 1); + s0[2] = vld1_u8(s + 2); + s0[3] = vld1_u8(s + 3); + + s1[0] = vld1_u8(s + src_stride + 0); + s1[1] = vld1_u8(s + src_stride + 1); + s1[2] = vld1_u8(s + src_stride + 2); + s1[3] = vld1_u8(s + src_stride + 3); + + uint8x8_t d0 = convolve4_8(s0[0], s0[1], s0[2], s0[3], filter_taps); + uint8x8_t d1 = convolve4_8(s1[0], s1[1], s1[2], s1[3], filter_taps); + + vst1_u8(d, d0); + vst1_u8(d + dst_stride, d1); + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 2 * src_stride; + dst += 2 * dst_stride; + h -= 2; + } while (h > 0); + } } -static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3, - uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6, - uint8x8_t *s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} - -static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, - const uint8x8_t s1, const uint8x8_t s2, - const uint8x8_t s3, const uint8x8_t s4, - const uint8x8_t s5, const uint8x8_t s6, - const uint8x8_t s7) { - vst1_u8(s, s0); - s += p; - vst1_u8(s, s1); - s += p; - vst1_u8(s, s2); - s += p; - vst1_u8(s, s3); - s += p; - vst1_u8(s, s4); - s += p; - vst1_u8(s, s5); - s += p; - vst1_u8(s, s6); - s += p; - vst1_u8(s, s7); -} - -static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2, - int16x4_t s3, int16x4_t s4, int16x4_t s5, - int16x4_t s6, int16x4_t s7, - int16x8_t filters, int16x4_t filter3, - int16x4_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x4_t sum = vdup_n_s16(0); - - sum = vmla_lane_s16(sum, s0, filters_lo, 0); - sum = vmla_lane_s16(sum, s1, filters_lo, 1); - sum = vmla_lane_s16(sum, s2, filters_lo, 2); - sum = vmla_lane_s16(sum, s5, filters_hi, 1); - sum = vmla_lane_s16(sum, s6, filters_hi, 2); - sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); - return sum; -} - -static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2, - int16x8_t s3, int16x8_t s4, int16x8_t s5, - int16x8_t s6, int16x8_t s7, - int16x8_t filters, int16x8_t filter3, - int16x8_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x8_t sum = vdupq_n_s16(0); - - sum = vmlaq_lane_s16(sum, s0, filters_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); - sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); - return sum; -} - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); - uint8x8_t t0, t1, t2, t3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_y; - - src -= 3; - +static INLINE void convolve_8tap_horiz_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { if (h == 4) { - uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), - vreinterpret_u32_u8(d01), 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), - vreinterpret_u32_u8(d23), 0); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), - vreinterpret_u32_u8(d01), 1); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), - vreinterpret_u32_u8(d23), 1); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -217,157 +141,94 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - if (w == 4) { do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); - dst += dst_stride; + dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { - uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + uint8_t *d = dst; + int width = w; do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter); + uint8x8_t d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -379,7 +240,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; @@ -388,93 +249,89 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } } +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + const int16x8_t x_filter = vld1q_s16(filter[x0_q4]); + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + convolve_4tap_horiz_neon(src - 1, src_stride, dst, dst_stride, w, h, + x_filter); + } else { + convolve_8tap_horiz_neon(src - 3, src_stride, dst, dst_stride, w, h, + x_filter); + } +} + void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); - uint8x8_t t0, t1, t2, t3; + const int16x8_t filters = vld1q_s16(filter[x0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; if (h == 4) { - uint8x8_t d01, d23; - int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, - d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint32x4_t d0123 = vdupq_n_u32(0); + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + transpose_u8_8x4(&t7, &t8, &t9, &t10); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + int16x4_t s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t8))); + int16x4_t s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t9))); + int16x4_t s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); transpose_u8_4x4(&d01, &d23); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + uint8x8_t dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); - vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -486,190 +343,114 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 4; dst += 4; w -= 4; - } while (w > 0); + } while (w != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); - int width; - const uint8_t *s; - uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - if (w == 4) { - uint32x4_t d0415 = vdupq_n_u32(0); - uint32x4_t d2637 = vdupq_n_u32(0); do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + uint8x8_t d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + uint8x8_t dd04 = load_u8(dst + 0 * dst_stride, 4 * dst_stride); + uint8x8_t dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + uint8x8_t dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + uint8x8_t dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); src += 8 * src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - transpose_u8_8x4(&t0, &t1, &t2, &t3); - - d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); - d0415 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); - d2637 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); - - vst1q_lane_u32((uint32_t *)dst, d0415, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 3); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 3); - dst += dst_stride; + dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } else { - uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; - uint8x16_t d01, d23, d45, d67; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - width = w; - s = src + 7; - d = dst; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(dst + 4 * dst_stride); - __builtin_prefetch(dst + 5 * dst_stride); - __builtin_prefetch(dst + 6 * dst_stride); - __builtin_prefetch(dst + 7 * dst_stride); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7; + uint8_t *d = dst; + int width = w; do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); - s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); - s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); - s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); - s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); - s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); - s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15; + load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14, + &t15); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, - filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, - filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, - filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, - filter3, filter4); + transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11)); + int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12)); + int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13)); + int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14)); + int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15)); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + uint8x8_t d5 = + convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + uint8x8_t d6 = + convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + uint8x8_t d7 = + convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), - vld1_u8(d + 5 * dst_stride)); - d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), - vld1_u8(d + 7 * dst_stride)); - d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); - d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); - d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); - d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); - store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -681,272 +462,187 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s += 8; d += 8; width -= 8; - } while (width > 0); + } while (width != 0); src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } } } +static INLINE void convolve_8tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); + + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; - src -= 3 * src_stride; + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); - if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - uint8x8_t d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - - do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - h -= 4; - } while (h > 0); + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); - int height; - const uint8_t *s; - uint8_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - d = dst; - height = h; - - do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); - - vst1_u8(d, vqrshrun_n_s16(d0, 7)); - d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d1, 7)); - d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d2, 7)); - d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d3, 7)); - d += dst_stride; - - s0 = s4; - s1 = s5; - s2 = s6; - s3 = s7; - s4 = s8; - s5 = s9; - s6 = s10; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); + convolve_8tap_vert_neon(src - 3 * src_stride, src_stride, dst, dst_stride, + w, h, y_filter); } } void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; if (w == 4) { - const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); - const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - uint8x8_t d01, d23; - int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint32x4_t d0123 = vdupq_n_u32(0); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + int16x4_t s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + int16x4_t s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + int16x4_t s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + int16x4_t s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + int16x4_t s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + int16x4_t s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + int16x4_t s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + int16x4_t s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t7))); + int16x4_t s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t8))); + int16x4_t s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t9))); + int16x4_t s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t10))); - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + int16x4_t d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + int16x4_t d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + int16x4_t d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + int16x4_t d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); - vst1q_lane_u32((uint32_t *)dst, d0123, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 3); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -955,87 +651,45 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; - } while (h > 0); + } while (h != 0); } else { - const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); - const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); - int height; - const uint8_t *s; - uint8_t *d; - uint8x16_t d01, d23, dd01, dd23; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - do { - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); - __builtin_prefetch(src + 4 * src_stride); - __builtin_prefetch(src + 5 * src_stride); - __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - d = dst; - height = h; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + const uint8_t *s = src + 7 * src_stride; + uint8_t *d = dst; + int height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8)); + int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9)); + int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10)); - __builtin_prefetch(d + 0 * dst_stride); - __builtin_prefetch(d + 1 * dst_stride); - __builtin_prefetch(d + 2 * dst_stride); - __builtin_prefetch(d + 3 * dst_stride); - __builtin_prefetch(s + 0 * src_stride); - __builtin_prefetch(s + 1 * src_stride); - __builtin_prefetch(s + 2 * src_stride); - __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, - filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, - filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, - filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, - filter4); + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + uint8x8_t d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + uint8x8_t d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + uint8x8_t d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7)); - d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7)); - dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - dd01 = vrhaddq_u8(dd01, d01); - dd23 = vrhaddq_u8(dd23, d23); + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); - vst1_u8(d, vget_low_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_low_u8(dd23)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd23)); - d += dst_stride; + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -1045,10 +699,12 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; - } while (height > 0); + s += 4 * src_stride; + d += 4 * dst_stride; + } while (height != 0); src += 8; dst += 8; w -= 8; - } while (w > 0); + } while (w != 0); } } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 0000000000..10cc761ccd --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3)); + sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); + sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters); +} + +// 2-tap (bilinear) filter values are always positive, but 4-tap filter values +// are negative on the outer edges (taps 0 and 3), with taps 1 and 2 having much +// greater positive values to compensate. To use instructions that operate on +// 8-bit types we also need the types to be unsigned. Subtracting the products +// of taps 0 and 3 from the products of taps 1 and 2 always works given that +// 2-tap filters are 0-padded. +static INLINE uint8x8_t convolve4_8(const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t filter_taps[4]) { + uint16x8_t sum = vmull_u8(s1, filter_taps[1]); + sum = vmlal_u8(sum, s2, filter_taps[2]); + sum = vmlsl_u8(sum, s0, filter_taps[0]); + sum = vmlsl_u8(sum, s3, filter_taps[3]); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_BITS - 1); +} + +static INLINE void convolve_4tap_vert_neon(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, + const int16x8_t filter) { + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const uint8x8_t y_filter = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(filter)), 1); + + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + uint8x8_t s01 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s12 = load_unaligned_u8(src + 1 * src_stride, src_stride); + + src += 2 * src_stride; + + do { + uint8x8_t s23 = load_unaligned_u8(src + 0 * src_stride, src_stride); + uint8x8_t s34 = load_unaligned_u8(src + 1 * src_stride, src_stride); + uint8x8_t s45 = load_unaligned_u8(src + 2 * src_stride, src_stride); + uint8x8_t s56 = load_unaligned_u8(src + 3 * src_stride, src_stride); + + uint8x8_t d01 = convolve4_8(s01, s12, s23, s34, filter_taps); + uint8x8_t d23 = convolve4_8(s23, s34, s45, s56, filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + s01 = s45; + s12 = s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t s0, s1, s2; + load_u8_8x3(s, src_stride, &s0, &s1, &s2); + + s += 3 * src_stride; + + do { + uint8x8_t s3, s4, s5, s6; + load_u8_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + uint8x8_t d0 = convolve4_8(s0, s1, s2, s3, filter_taps); + uint8x8_t d1 = convolve4_8(s1, s2, s3, s4, filter_taps); + uint8x8_t d2 = convolve4_8(s2, s3, s4, s5, filter_taps); + uint8x8_t d3 = convolve4_8(s3, s4, s5, s6, filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s0 = s4; + s1 = s5; + s2 = s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm deleted file mode 100644 index 2d0f2ae065..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ /dev/null @@ -1,270 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_horiz_neon| - EXPORT |vpx_convolve8_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|vpx_convolve8_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -vpx_convolve8_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - pld [r5] - pld [r8] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r7] - pld [r4] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c new file mode 100644 index 0000000000..c4177c5385 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h" + +/* Type1 and Type2 functions are called depending on the position of the + * negative and positive coefficients in the filter. In type1, the filter kernel + * used is sub_pel_filters_8lp, in which only the first two and the last two + * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 & + * 7. + */ + +#define DEFINE_FILTER(dir) \ + void vpx_convolve8_##dir##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + if (filter == vp9_filter_kernels[1]) { \ + vpx_convolve8_##dir##_filter_type1_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + vpx_convolve8_##dir##_filter_type2_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } \ + } + +DEFINE_FILTER(horiz) +DEFINE_FILTER(avg_horiz) +DEFINE_FILTER(vert) +DEFINE_FILTER(avg_vert) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h new file mode 100644 index 0000000000..f1c7d62ed0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ + +#define DECLARE_FILTER(dir, type) \ + void vpx_convolve8_##dir##_filter_##type##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +DECLARE_FILTER(horiz, type1) +DECLARE_FILTER(avg_horiz, type1) +DECLARE_FILTER(horiz, type2) +DECLARE_FILTER(avg_horiz, type2) +DECLARE_FILTER(vert, type1) +DECLARE_FILTER(avg_vert, type1) +DECLARE_FILTER(vert, type2) +DECLARE_FILTER(avg_vert, type2) + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 0000000000..b919843de6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +// Filter values always sum to 128. +#define FILTER_SUM 128 + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl); + + // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. (Divide + // by 2 since we halved the filter values.) + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM / 2); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum = vdotq_lane_s32(sum, perm_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} + +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x16_t samples_128 = + vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128))); + + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]), + vqtbl1q_s8(samples_128, permute_tbl.val[1]), + vqtbl1q_s8(samples_128, permute_tbl.val[2]) }; + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, perm_samples[0], filters, 0); + sum0 = vdotq_lane_s32(sum0, perm_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, perm_samples[1], filters, 0); + sum1 = vdotq_lane_s32(sum1, perm_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_4tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void convolve_8tap_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_dotprod(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); + + } else { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_dotprod(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + + uint8x8_t dd0, dd1, dd2, dd3; + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve8_4_v(const int8x16_t samples_lo, + const int8x16_t samples_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + int32x4_t sum = vdotq_lane_s32(acc, samples_lo, filters, 0); + sum = vdotq_lane_s32(sum, samples_hi, filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} + +static INLINE uint8x8_t convolve8_8_v(const int8x16_t samples0_lo, + const int8x16_t samples0_hi, + const int8x16_t samples1_lo, + const int8x16_t samples1_hi, + const int8x8_t filters) { + // The sample range transform and permutation are performed by the caller. + + // Accumulate into 128 * FILTER_SUM to account for range transform. + int32x4_t acc = vdupq_n_s32(128 * FILTER_SUM); + // First 4 output values. + int32x4_t sum0 = vdotq_lane_s32(acc, samples0_lo, filters, 0); + sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vdotq_lane_s32(acc, samples1_lo, filters, 0); + sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_8tap_vert_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s78910; + transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s78910_lo, s78910_hi; + transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); + } else { + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_dotprod(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123, s1234, s2345, s3456; + transpose_concat_s8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_s8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_s8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_s8_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s78910; + transpose_concat_s8_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456, s78910 } }; + int8x16_t s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + // Transform sample range to [-128, 127] for 8-bit signed dot product. + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128))); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128))); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128))); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128))); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128))); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128))); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128))); + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_s8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_s8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_s8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_s8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t t7, t8, t9, t10; + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128))); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128))); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128))); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128))); + + int8x16_t s78910_lo, s78910_hi; + transpose_concat_s8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + int8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + int8x16_t s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + int8x16_t s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + int8x16_t s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + int8x16_t s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_4tap_2d_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_dotprod( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_dotprod(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - horiz_offset - vert_offset, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + convolve_8tap_vert_neon_dotprod(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_dotprod(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_dotprod(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c new file mode 100644 index 0000000000..b9d88bcfb1 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +static INLINE int16x4_t convolve4_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + uint8x16_t permuted_samples = vqtbl1q_u8(samples, permute_tbl); + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples, filters, 0); + + // Further narrowing and packing is performed by the caller. + return vmovn_s32(sum); +} + +static INLINE uint8x8_t convolve4_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vmovn_s32(sum0), vmovn_s32(sum1)); + // We halved the filter values so -1 from right shift. + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE int16x4_t convolve8_4_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x2_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + uint8x16_t permuted_samples[2] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]) }; + + int32x4_t sum = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} + +static INLINE uint8x8_t convolve8_8_h(const uint8x16_t samples, + const int8x8_t filters, + const uint8x16x3_t permute_tbl) { + // Permute samples ready for dot product. + // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } + // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } + // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } + uint8x16_t permuted_samples[3] = { vqtbl1q_u8(samples, permute_tbl.val[0]), + vqtbl1q_u8(samples, permute_tbl.val[1]), + vqtbl1q_u8(samples, permute_tbl.val[2]) }; + + // First 4 output values. + int32x4_t sum0 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); + sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1); + // Second 4 output values. + int32x4_t sum1 = + vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0); + sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_4tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve4_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve4_4_h(s3, filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve4_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve4_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve4_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve4_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void convolve_8tap_horiz_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[x0_q4]) <= 4) { + // Load 4-tap filter into first 4 elements of the vector. + // All 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + + convolve_4tap_horiz_neon_i8mm(src - 1, src_stride, dst, dst_stride, w, h, + x_filter_4tap); + + } else { + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_horiz_neon_i8mm(src - 3, src_stride, dst, dst_stride, w, h, + x_filter_8tap); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t t0 = convolve8_4_h(s0, filters, permute_tbl); + int16x4_t t1 = convolve8_4_h(s1, filters, permute_tbl); + int16x4_t t2 = convolve8_4_h(s2, filters, permute_tbl); + int16x4_t t3 = convolve8_4_h(s3, filters, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filters, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filters, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filters, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filters, permute_tbl); + + uint8x8_t dd0, dd1, dd2, dd3; + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE int16x4_t convolve8_4_v(const uint8x16_t samples_lo, + const uint8x16_t samples_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + int32x4_t sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); + sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); + + // Further narrowing and packing is performed by the caller. + return vshrn_n_s32(sum, 1); +} + +static INLINE uint8x8_t convolve8_8_v(const uint8x16_t samples0_lo, + const uint8x16_t samples0_hi, + const uint8x16_t samples1_lo, + const uint8x16_t samples1_hi, + const int8x8_t filters) { + // Sample permutation is performed by the caller. + + // First 4 output values. + int32x4_t sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0); + sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1); + // Second 4 output values. + int32x4_t sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0); + sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1); + + // Narrow and re-pack. + int16x8_t sum = vcombine_s16(vshrn_n_s32(sum0, 1), vshrn_n_s32(sum1, 1)); + return vqrshrun_n_s16(sum, FILTER_BITS - 1); +} + +static INLINE void convolve_8tap_vert_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t filter) { + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s78910; + transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_v(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filter); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filter); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filter); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filter); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + if (vpx_get_filter_taps(filter[y0_q4]) <= 4) { + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + convolve_4tap_vert_neon(src - src_stride, src_stride, dst, dst_stride, w, h, + y_filter); + } else { + const int8x8_t y_filter = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_vert_neon_i8mm(src - 3 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123, s1234, s2345, s3456; + transpose_concat_u8_4x4(s0, s1, s2, s3, &s0123); + transpose_concat_u8_4x4(s1, s2, s3, s4, &s1234); + transpose_concat_u8_4x4(s2, s3, s4, s5, &s2345); + transpose_concat_u8_4x4(s3, s4, s5, s6, &s3456); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s78910; + transpose_concat_u8_4x4(s7, s8, s9, s10, &s78910); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456, s78910 } }; + uint8x16_t s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + int16x4_t d0 = convolve8_4_v(s0123, s4567, filters); + int16x4_t d1 = convolve8_4_v(s1234, s5678, filters); + int16x4_t d2 = convolve8_4_v(s2345, s6789, filters); + int16x4_t d3 = convolve8_4_v(s3456, s78910, filters); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + uint8x8_t dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + uint8x8_t dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x8_t s0, s1, s2, s3, s4, s5, s6; + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // (see horizontal case) required before computing the dot product. + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; + transpose_concat_u8_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi); + transpose_concat_u8_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi); + transpose_concat_u8_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi); + transpose_concat_u8_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi); + + do { + uint8x8_t s7, s8, s9, s10; + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + uint8x16_t s78910_lo, s78910_hi; + transpose_concat_u8_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi); + + // Merge new data into block from previous iteration. + uint8x16x2_t samples_LUT = { { s3456_lo, s78910_lo } }; + uint8x16_t s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + uint8x16_t s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + uint8x16_t s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + uint8x16_t s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + uint8x8_t d0 = + convolve8_8_v(s0123_lo, s4567_lo, s0123_hi, s4567_hi, filters); + uint8x8_t d1 = + convolve8_8_v(s1234_lo, s5678_lo, s1234_hi, s5678_hi, filters); + uint8x8_t d2 = + convolve8_8_v(s2345_lo, s6789_lo, s2345_hi, s6789_hi, filters); + uint8x8_t d3 = + convolve8_8_v(s3456_lo, s78910_lo, s3456_hi, s78910_hi, filters); + + uint8x8_t dd0, dd1, dd2, dd3; + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_4tap_2d_neon_i8mm(const uint8_t *src, + ptrdiff_t src_stride, + uint8_t *dst, + ptrdiff_t dst_stride, int w, + int h, const int8x8_t x_filter, + const uint8x8_t y_filter) { + // Neon does not have lane-referencing multiply or multiply-accumulate + // instructions that operate on vectors of 8-bit elements. This means we have + // to duplicate filter taps into a whole vector and use standard multiply / + // multiply-accumulate instructions. + const uint8x8_t y_filter_taps[4] = { vdup_lane_u8(y_filter, 2), + vdup_lane_u8(y_filter, 3), + vdup_lane_u8(y_filter, 4), + vdup_lane_u8(y_filter, 5) }; + + if (w == 4) { + const uint8x16_t permute_tbl = vld1q_u8(dot_prod_permute_tbl); + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2); + + int16x4_t t0 = convolve4_4_h(h_s0, x_filter, permute_tbl); + int16x4_t t1 = convolve4_4_h(h_s1, x_filter, permute_tbl); + int16x4_t t2 = convolve4_4_h(h_s2, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1); + uint8x8_t v_s12 = vqrshrun_n_s16(vcombine_s16(t1, t2), FILTER_BITS - 1); + + src += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + int16x4_t t3 = convolve4_4_h(h_s3, x_filter, permute_tbl); + int16x4_t t4 = convolve4_4_h(h_s4, x_filter, permute_tbl); + int16x4_t t5 = convolve4_4_h(h_s5, x_filter, permute_tbl); + int16x4_t t6 = convolve4_4_h(h_s6, x_filter, permute_tbl); + // We halved the filter values so -1 from right shift. + uint8x8_t v_s34 = vqrshrun_n_s16(vcombine_s16(t3, t4), FILTER_BITS - 1); + uint8x8_t v_s56 = vqrshrun_n_s16(vcombine_s16(t5, t6), FILTER_BITS - 1); + uint8x8_t v_s23 = vext_u8(v_s12, v_s34, 4); + uint8x8_t v_s45 = vext_u8(v_s34, v_s56, 4); + + uint8x8_t d01 = convolve4_8(v_s01, v_s12, v_s23, v_s34, y_filter_taps); + uint8x8_t d23 = convolve4_8(v_s23, v_s34, v_s45, v_s56, y_filter_taps); + + store_unaligned_u8(dst + 0 * dst_stride, dst_stride, d01); + store_unaligned_u8(dst + 2 * dst_stride, dst_stride, d23); + + v_s01 = v_s45; + v_s12 = v_s56; + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int height = h; + + uint8x16_t h_s0, h_s1, h_s2; + load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2); + + uint8x8_t v_s0 = convolve4_8_h(h_s0, x_filter, permute_tbl); + uint8x8_t v_s1 = convolve4_8_h(h_s1, x_filter, permute_tbl); + uint8x8_t v_s2 = convolve4_8_h(h_s2, x_filter, permute_tbl); + + s += 3 * src_stride; + + do { + uint8x16_t h_s3, h_s4, h_s5, h_s6; + load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6); + + uint8x8_t v_s3 = convolve4_8_h(h_s3, x_filter, permute_tbl); + uint8x8_t v_s4 = convolve4_8_h(h_s4, x_filter, permute_tbl); + uint8x8_t v_s5 = convolve4_8_h(h_s5, x_filter, permute_tbl); + uint8x8_t v_s6 = convolve4_8_h(h_s6, x_filter, permute_tbl); + + uint8x8_t d0 = convolve4_8(v_s0, v_s1, v_s2, v_s3, y_filter_taps); + uint8x8_t d1 = convolve4_8(v_s1, v_s2, v_s3, v_s4, y_filter_taps); + uint8x8_t d2 = convolve4_8(v_s2, v_s3, v_s4, v_s5, y_filter_taps); + uint8x8_t d3 = convolve4_8(v_s3, v_s4, v_s5, v_s6, y_filter_taps); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + v_s0 = v_s4; + v_s1 = v_s5; + v_s2 = v_s6; + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +static INLINE void convolve_8tap_2d_horiz_neon_i8mm( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, int w, int h, const int8x8_t filter) { + if (w == 4) { + const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + int16x4_t d3 = convolve8_4_h(s3, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + uint8x16_t s0, s1, s2; + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + int16x4_t d0 = convolve8_4_h(s0, filter, permute_tbl); + int16x4_t d1 = convolve8_4_h(s1, filter, permute_tbl); + int16x4_t d2 = convolve8_4_h(s2, filter, permute_tbl); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); + uint8x8_t d23 = + vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS - 1); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + + do { + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2, s3; + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + uint8x8_t d3 = convolve8_8_h(s3, filter, permute_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + // Process final three rows (h % 4 == 3). See vpx_convolve_neon_i8mm() + // below for further details on possible values of block height. + const uint8_t *s = src; + uint8_t *d = dst; + int width = w; + + do { + uint8x16_t s0, s1, s2; + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + uint8x8_t d0 = convolve8_8_h(s0, filter, permute_tbl); + uint8x8_t d1 = convolve8_8_h(s1, filter, permute_tbl); + uint8x8_t d2 = convolve8_8_h(s2, filter, permute_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + + const int x_filter_taps = vpx_get_filter_taps(filter[x0_q4]) <= 4 ? 4 : 8; + const int y_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for needing filter_taps / 2 - 1 lines prior and filter_taps / 2 + // lines post both horizontally and vertically. + const ptrdiff_t horiz_offset = x_filter_taps / 2 - 1; + const ptrdiff_t vert_offset = (y_filter_taps / 2 - 1) * src_stride; + + if (x_filter_taps == 4 && y_filter_taps == 4) { + const int16x4_t x_filter = vld1_s16(filter[x0_q4] + 2); + const int16x8_t y_filter = vld1q_s16(filter[y0_q4]); + + // 4-tap and bilinear filter values are even, so halve them to reduce + // intermediate precision requirements. + const int8x8_t x_filter_4tap = + vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1); + const uint8x8_t y_filter_4tap = + vshrn_n_u16(vreinterpretq_u16_s16(vabsq_s16(y_filter)), 1); + + convolve_4tap_2d_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + dst, dst_stride, w, h, x_filter_4tap, + y_filter_4tap); + return; + } + + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS - 1; + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int8x8_t y_filter_8tap = vmovn_s16(vld1q_s16(filter[y0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - horiz_offset - vert_offset, src_stride, + im_block, im_stride, w, im_height, + x_filter_8tap); + + convolve_8tap_vert_neon_i8mm(im_block, im_stride, dst, dst_stride, w, h, + y_filter_8tap); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 71]); + const int im_stride = 64; + + // Averaging convolution always uses an 8-tap filter. + // Account for the vertical phase needing 3 lines prior and 4 lines post. + const int im_height = h + SUBPEL_TAPS - 1; + const ptrdiff_t offset = SUBPEL_TAPS / 2 - 1; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + const int8x8_t x_filter_8tap = vmovn_s16(vld1q_s16(filter[x0_q4])); + + convolve_8tap_2d_horiz_neon_i8mm(src - offset - offset * src_stride, + src_stride, im_block, im_stride, w, + im_height, x_filter_8tap); + + vpx_convolve8_avg_vert_neon_i8mm(im_block + offset * im_stride, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm new file mode 100644 index 0000000000..2666d4253e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm @@ -0,0 +1,457 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlsl.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from + ; sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm new file mode 100644 index 0000000000..cb5d6d3fe5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm @@ -0,0 +1,455 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlal.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c index 04cb835fa3..8e3ee599f4 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -15,13 +15,13 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // avg4 uint8x8_t s0, s1; @@ -43,7 +43,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dd0), 1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // avg8 uint8x8_t s0, s1, d0, d1; uint8x16_t s01, d01; @@ -64,7 +64,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, vget_high_u8(d01)); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // avg16 uint8x16_t s0, s1, d0, d1; do { @@ -83,7 +83,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, d1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // avg32 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { @@ -110,7 +110,7 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, d3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // avg64 uint8x16_t s0, s1, s2, s3, d0, d1, d2, d3; do { diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189fda..efd6574f1f 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c index a8f690acd4..bea7c98437 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -9,30 +9,32 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // copy4 do { - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 uint8x8_t s0, s1; do { @@ -46,7 +48,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 uint8x16_t s0, s1; do { @@ -60,7 +62,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 32) { // copy32 uint8x16_t s0, s1, s2, s3; do { @@ -78,7 +80,7 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, vst1q_u8(dst + 16, s3); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else { // copy64 uint8x16_t s0, s1, s2, s3; do { diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad48b..7a66e3ce2f 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 6ca0e501b3..de5fa29471 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -12,53 +12,61 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). - */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + // Given our constraints: w <= 64, h <= 64, taps <= 8 we can reduce the + // maximum buffer size to 64 * (64 + 7) (+1 row to make it divisible by 4). + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; - // Account for the vertical phase needing 3 lines prior and 4 lines post - const int intermediate_height = h + 7; + const int vert_filter_taps = vpx_get_filter_taps(filter[y0_q4]) <= 4 ? 4 : 8; + // Account for the vertical phase needing vert_filter_taps / 2 - 1 lines prior + // and vert_filter_taps / 2 lines post. (+1 to make total divisible by 4.) + const int im_height = h + vert_filter_taps; + const ptrdiff_t border_offset = vert_filter_taps / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the given - * height and filter a multiple of 4 lines. Since this goes in to the temp - * buffer which has lots of extra room and is subsequently discarded this is - * safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); + // Filter starting border_offset rows back. The Neon implementation will + // ignore the given height and filter a multiple of 4 lines. Since this goes + // into the temporary buffer which has lots of extra room and is subsequently + // discarded this is safe if somewhat less than ideal. + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); - /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + // Step into the temporary buffer border_offset rows to get actual frame data. + vpx_convolve8_vert_neon(im_block + im_stride * border_offset, im_stride, dst, + dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - const int intermediate_height = h + 7; + DECLARE_ALIGNED(32, uint8_t, im_block[64 * 72]); + const int im_stride = 64; + const int im_height = h + SUBPEL_TAPS; + const ptrdiff_t border_offset = SUBPEL_TAPS / 2 - 1; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + // This implementation has the same issues as above. In addition, we only want + // to average the values after both passes. + vpx_convolve8_horiz_neon(src - src_stride * border_offset, src_stride, + im_block, im_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, im_height); + + vpx_convolve8_avg_vert_neon(im_block + im_stride * border_offset, im_stride, + dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h new file mode 100644 index 0000000000..f30143e3e3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve2_bridge.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ + +#include +#include +#include + +// Some very useful instructions are exclusive to the SVE2 instruction set. +// However, we can access these instructions from a predominantly Neon context +// by making use of the Neon-SVE bridge intrinsics to reinterpret Neon vectors +// as SVE vectors - with the high part of the SVE vector (if it's longer than +// 128 bits) being "don't care". + +static INLINE int16x8_t vpx_tbl2_s16(int16x8_t s0, int16x8_t s1, + uint16x8_t tbl) { + svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), + svset_neonq_s16(svundef_s16(), s1)); + return svget_neonq_s16( + svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); +} + +static INLINE void vpx_tbl2x4_s16(int16x8_t s0[4], int16x8_t s1[4], + int16x8_t res[4], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); + res[2] = vpx_tbl2_s16(s0[2], s1[2], idx); + res[3] = vpx_tbl2_s16(s0[3], s1[3], idx); +} + +static INLINE void vpx_tbl2x2_s16(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t res[2], uint16x8_t idx) { + res[0] = vpx_tbl2_s16(s0[0], s1[0], idx); + res[1] = vpx_tbl2_s16(s0[1], s1[1], idx); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE2_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h new file mode 100644 index 0000000000..2a2d89f42a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_neon_sve_bridge.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2024 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ +#define VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ + +#include +#include +#include + +// Dot product instructions operating on 16-bit input elements are exclusive to +// the SVE instruction set. However, we can access these instructions from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. + +static INLINE uint64x2_t vpx_dotq_u16(uint64x2_t acc, uint16x8_t x, + uint16x8_t y) { + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + +static INLINE int64x2_t vpx_dotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +#define vpx_dotq_lane_s16(acc, x, y, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), acc), \ + svset_neonq_s16(svundef_s16(), x), \ + svset_neonq_s16(svundef_s16(), y), lane)) + +static INLINE uint16x8_t vpx_tbl_u16(uint16x8_t data, uint16x8_t indices) { + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), data), + svset_neonq_u16(svundef_u16(), indices))); +} + +static INLINE int16x8_t vpx_tbl_s16(int16x8_t data, uint16x8_t indices) { + return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), data), + svset_neonq_u16(svundef_u16(), indices))); +} + +#endif // VPX_VPX_DSP_ARM_VPX_NEON_SVE_BRIDGE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 0000000000..f40b6a907f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_neon( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filter, + const int x0_q4, const int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + + src -= SUBPEL_TAPS / 2 - 1; + + if (w == 4) { + do { + int x_q4 = x0_q4; + + // Process a 4x4 tile. + for (int r = 0; r < 4; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + store_u8_4x1(&temp[4 * r], d0); + } else { + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 4; ++c) { + temp[r * 4 + c] = s[c * src_stride]; + } + } + x_q4 += x_step_q4; + } + + // Transpose the 4x4 result tile and store. + uint8x8_t d01 = vld1_u8(temp + 0); + uint8x8_t d23 = vld1_u8(temp + 8); + + transpose_u8_4x4(&d01, &d23); + + store_u8_4x1(dst + 0 * dst_stride, d01); + store_u8_4x1(dst + 1 * dst_stride, d23); + store_u8_4x1_high(dst + 2 * dst_stride, d01); + store_u8_4x1_high(dst + 3 * dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 0); + return; + } + + do { + int x_q4 = x0_q4; + uint8_t *d = dst; + int width = w; + + do { + // Process an 8x8 tile. + for (int r = 0; r < 8; ++r) { + const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(&temp[r * 8], d0); + } else { + // Memcpy for non-subpel locations. + s += SUBPEL_TAPS / 2 - 1; + + for (int c = 0; c < 8; ++c) { + temp[r * 8 + c] = s[c * src_stride]; + } + } + x_q4 += x_step_q4; + } + + // Transpose the 8x8 result tile and store. + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + d += 8; + width -= 8; + } while (width != 0); + + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); +} + +static INLINE void scaledconvolve_vert_neon( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filter, + const int y0_q4, const int y_step_q4, int w, int h) { + int y_q4 = y0_q4; + + if (w == 4) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); + int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); + int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); + int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); + + int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); + uint8x8_t d0 = + vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS); + + store_u8_4x1(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + if (w == 8) { + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); + + vst1_u8(dst, d0); + } else { + // Memcpy for non-subpel locations. + memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); + return; + } + + do { + const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + uint8_t *d = dst; + int width = w; + + if (y_q4 & SUBPEL_MASK) { + do { + const int16x8_t filter = vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]); + + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); + s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); + s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); + s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); + s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); + s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); + s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); + s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); + + s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); + s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); + s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); + s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); + s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); + s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); + s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); + s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); + + uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], + s6[0], s7[0], filter); + uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], + s6[1], s7[1], filter); + + vst1q_u8(d, vcombine_u8(d0, d1)); + + s += 16; + d += 16; + width -= 16; + } while (width != 0); + } else { + // Memcpy for non-subpel locations. + s += (SUBPEL_TAPS / 2 - 1) * src_stride; + + do { + uint8x16_t s0 = vld1q_u8(s); + vst1q_u8(d, s0); + s += 16; + d += 16; + width -= 16; + } while (width != 0); + } + + y_q4 += y_step_q4; + dst += dst_stride; + } while (--h != 0); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Fixed size intermediate buffer, im_block, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the im_block buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); + const int im_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + const ptrdiff_t im_stride = 64; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + scaledconvolve_horiz_neon(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, im_block, im_stride, filter, x0_q4, + x_step_q4, w, im_height); + + scaledconvolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/avg.c b/media/libvpx/libvpx/vpx_dsp/avg.c index 4d9abb8de3..a8dcab7dae 100644 --- a/media/libvpx/libvpx/vpx_dsp/avg.c +++ b/media/libvpx/libvpx/vpx_dsp/avg.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + +#include #include #include "./vpx_dsp_rtcd.h" @@ -32,9 +34,169 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { return (sum + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +// src_diff: 13 bit, dynamic range [-4095, 4095] +// coeff: 16 bit +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; @@ -66,10 +228,11 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; int16_t buffer[64]; + int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit @@ -80,17 +243,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; } // In place 16x16 2D Hadamard transform -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] @@ -101,15 +266,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; @@ -120,9 +285,53 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, } } +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 16 bit, dynamic range [-32768, 32767] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +// coeff: dynamic range 20 bit. +// length: value range {16, 64, 256, 1024}. +int vpx_highbd_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits + return satd; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. -int vpx_satd_c(const int16_t *coeff, int length) { +int vpx_satd_c(const tran_low_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); @@ -137,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; const int norm_factor = height >> 1; + assert(height >= 2); for (idx = 0; idx < 16; ++idx) { int i; hbuf[idx] = 0; @@ -218,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int i, j; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8); - *min = 255; + *min = 65535; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader.h b/media/libvpx/libvpx/vpx_dsp/bitreader.h index 6ee2a58632..a5927ea2ad 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitreader.h +++ b/media/libvpx/libvpx/vpx_dsp/bitreader.h @@ -8,10 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITREADER_H_ -#define VPX_DSP_BITREADER_H_ +#ifndef VPX_VPX_DSP_BITREADER_H_ +#define VPX_VPX_DSP_BITREADER_H_ #include +#include #include #include "./vpx_config.h" @@ -19,6 +20,9 @@ #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -94,7 +98,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { } { - register int shift = vpx_norm[range]; + const unsigned char shift = vpx_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -103,6 +107,31 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { r->count = count; r->range = range; +#if CONFIG_BITSTREAM_DEBUG + { + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if ((int)bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } + } +#endif + return bit; } @@ -131,4 +160,4 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VPX_DSP_BITREADER_H_ +#endif // VPX_VPX_DSP_BITREADER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c index e99fffb605..f59f1f7cb9 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c +++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.c @@ -23,7 +23,7 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { rb->bit_offset = off + 1; return bit; } else { - rb->error_handler(rb->error_handler_data); + if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data); return 0; } } @@ -40,11 +40,5 @@ int vpx_rb_read_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { } int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits) { -#if CONFIG_MISC_FIXES - const int nbits = sizeof(unsigned) * 8 - bits - 1; - const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits; - return ((int)value) >> nbits; -#else return vpx_rb_read_signed_literal(rb, bits); -#endif } diff --git a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h index 8a48a95ed1..b27703a4db 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h +++ b/media/libvpx/libvpx/vpx_dsp/bitreader_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITREADER_BUFFER_H_ -#define VPX_DSP_BITREADER_BUFFER_H_ +#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_ +#define VPX_VPX_DSP_BITREADER_BUFFER_H_ #include @@ -44,4 +44,4 @@ int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits); } // extern "C" #endif -#endif // VPX_DSP_BITREADER_BUFFER_H_ +#endif // VPX_VPX_DSP_BITREADER_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.c b/media/libvpx/libvpx/vpx_dsp/bitwriter.c index 81e28b309f..d3ef9bd89a 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitwriter.c +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.c @@ -9,23 +9,47 @@ */ #include +#include #include "./bitwriter.h" -void vpx_start_encode(vpx_writer *br, uint8_t *source) { +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + +void vpx_start_encode(vpx_writer *br, uint8_t *source, size_t size) { br->lowvalue = 0; br->range = 255; br->count = -24; - br->buffer = source; + br->error = 0; br->pos = 0; + // Make sure it is safe to cast br->pos to int in vpx_write(). + if (size > INT_MAX) size = INT_MAX; + br->size = (unsigned int)size; + br->buffer = source; vpx_write_bit(br, 0); } -void vpx_stop_encode(vpx_writer *br) { +int vpx_stop_encode(vpx_writer *br) { int i; +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif for (i = 0; i < 32; i++) vpx_write_bit(br, 0); // Ensure there's no ambigous collision with any index marker bytes - if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; + if (!br->error && (br->buffer[br->pos - 1] & 0xe0) == 0xc0) { + if (br->pos < br->size) { + br->buffer[br->pos++] = 0; + } else { + br->error = 1; + } + } + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif + + return br->error ? -1 : 0; } diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter.h b/media/libvpx/libvpx/vpx_dsp/bitwriter.h index 41040cf935..daff331daf 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitwriter.h +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter.h @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITWRITER_H_ -#define VPX_DSP_BITWRITER_H_ +#ifndef VPX_VPX_DSP_BITWRITER_H_ +#define VPX_VPX_DSP_BITWRITER_H_ +#include + +#include "vpx_ports/compiler_attributes.h" #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -23,19 +29,43 @@ typedef struct vpx_writer { unsigned int lowvalue; unsigned int range; int count; + // Whether there has been an error. + int error; + // We maintain the invariant that pos <= size, i.e., we never write beyond + // the end of the buffer. If pos would be incremented to be greater than + // size, leave pos unchanged and set error to 1. unsigned int pos; + unsigned int size; uint8_t *buffer; } vpx_writer; -void vpx_start_encode(vpx_writer *bc, uint8_t *buffer); -void vpx_stop_encode(vpx_writer *bc); +void vpx_start_encode(vpx_writer *br, uint8_t *source, size_t size); +// Returns 0 on success and returns -1 in case of error. +int vpx_stop_encode(vpx_writer *br); -static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { +static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br, + int bit, + int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; + +#if CONFIG_BITSTREAM_DEBUG + /* + int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + assert(0); + } + */ + bitstream_queue_push(bit, probability); +#endif split = 1 + (((range - 1) * probability) >> 8); @@ -54,18 +84,25 @@ static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { if (count >= 0) { int offset = shift - count; - if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = br->pos - 1; + if (!br->error) { + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = (int)br->pos - 1; - while (x >= 0 && br->buffer[x] == 0xff) { - br->buffer[x] = 0; - x--; + while (x >= 0 && br->buffer[x] == 0xff) { + br->buffer[x] = 0; + x--; + } + + // TODO(wtc): How to prove x >= 0? + br->buffer[x] += 1; } - br->buffer[x] += 1; + if (br->pos < br->size) { + br->buffer[br->pos++] = (lowvalue >> (24 - offset)) & 0xff; + } else { + br->error = 1; + } } - - br->buffer[br->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; lowvalue &= 0xffffff; @@ -94,4 +131,4 @@ static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { } // extern "C" #endif -#endif // VPX_DSP_BITWRITER_H_ +#endif // VPX_VPX_DSP_BITWRITER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c index 1043cdc784..b3a2490f2c 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.c @@ -8,24 +8,43 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "./vpx_config.h" #include "./bitwriter_buffer.h" +void vpx_wb_init(struct vpx_write_bit_buffer *wb, uint8_t *bit_buffer, + size_t size) { + wb->error = 0; + wb->bit_offset = 0; + wb->size = size; + wb->bit_buffer = bit_buffer; +} + +int vpx_wb_has_error(const struct vpx_write_bit_buffer *wb) { + return wb->error; +} + size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb) { + assert(!wb->error); return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); } void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit) { + if (wb->error) return; const int off = (int)wb->bit_offset; const int p = off / CHAR_BIT; const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if ((size_t)p >= wb->size) { + wb->error = 1; + return; + } if (q == CHAR_BIT - 1) { wb->bit_buffer[p] = bit << q; } else { - wb->bit_buffer[p] &= ~(1 << q); + assert((wb->bit_buffer[p] & (1 << q)) == 0); wb->bit_buffer[p] |= bit << q; } wb->bit_offset = off + 1; @@ -38,10 +57,6 @@ void vpx_wb_write_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, int bits) { -#if CONFIG_MISC_FIXES - vpx_wb_write_literal(wb, data, bits + 1); -#else vpx_wb_write_literal(wb, abs(data), bits); vpx_wb_write_bit(wb, data < 0); -#endif } diff --git a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h index a123a2fe8c..3ee0e9658b 100644 --- a/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h +++ b/media/libvpx/libvpx/vpx_dsp/bitwriter_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITWRITER_BUFFER_H_ -#define VPX_DSP_BITWRITER_BUFFER_H_ +#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_ +#define VPX_VPX_DSP_BITWRITER_BUFFER_H_ #include "vpx/vpx_integer.h" @@ -18,10 +18,24 @@ extern "C" { #endif struct vpx_write_bit_buffer { - uint8_t *bit_buffer; + // Whether there has been an error. + int error; + // We maintain the invariant that bit_offset <= size * CHAR_BIT, i.e., we + // never write beyond the end of bit_buffer. If bit_offset would be + // incremented to be greater than size * CHAR_BIT, leave bit_offset unchanged + // and set error to 1. size_t bit_offset; + // Size of bit_buffer in bytes. + size_t size; + uint8_t *bit_buffer; }; +void vpx_wb_init(struct vpx_write_bit_buffer *wb, uint8_t *bit_buffer, + size_t size); + +int vpx_wb_has_error(const struct vpx_write_bit_buffer *wb); + +// Must not be called if vpx_wb_has_error(wb) returns true. size_t vpx_wb_bytes_written(const struct vpx_write_bit_buffer *wb); void vpx_wb_write_bit(struct vpx_write_bit_buffer *wb, int bit); @@ -35,4 +49,4 @@ void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data, } // extern "C" #endif -#endif // VPX_DSP_BITWRITER_BUFFER_H_ +#endif // VPX_VPX_DSP_BITWRITER_BUFFER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/deblock.c b/media/libvpx/libvpx/vpx_dsp/deblock.c index 6c27484979..455b73bbce 100644 --- a/media/libvpx/libvpx/vpx_dsp/deblock.c +++ b/media/libvpx/libvpx/vpx_dsp/deblock.c @@ -7,7 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include #include +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" const int16_t vpx_rv[] = { @@ -37,32 +39,36 @@ const int16_t vpx_rv[] = { 9, 10, 13, }; -void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, int cols, - unsigned char *f, int size) { +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, + unsigned char *dst, int src_pitch, + int dst_pitch, int cols, + unsigned char *flimits, int size) { unsigned char *p_src, *p_dst; int row; int col; unsigned char v; unsigned char d[4]; + assert(size >= 8); + assert(cols >= 8); + for (row = 0; row < size; row++) { /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; + p_src = src; + p_dst = dst; for (col = 0; col < cols; col++) { - unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line]; - unsigned char p_above1 = p_src[col - src_pixels_per_line]; - unsigned char p_below1 = p_src[col + src_pixels_per_line]; - unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line]; + unsigned char p_above2 = p_src[col - 2 * src_pitch]; + unsigned char p_above1 = p_src[col - src_pitch]; + unsigned char p_below1 = p_src[col + src_pitch]; + unsigned char p_below2 = p_src[col + 2 * src_pitch]; v = p_src[col]; - if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) && - (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) { + if ((abs(v - p_above2) < flimits[col]) && + (abs(v - p_above1) < flimits[col]) && + (abs(v - p_below1) < flimits[col]) && + (abs(v - p_below2) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_above2 + p_above1 + 1) >> 1; k2 = (p_below2 + p_below1 + 1) >> 1; @@ -74,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, } /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; + p_src = dst; + p_dst = dst; p_src[-2] = p_src[-1] = p_src[0]; p_src[cols] = p_src[cols + 1] = p_src[cols - 1]; @@ -83,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, for (col = 0; col < cols; col++) { v = p_src[col]; - if ((abs(v - p_src[col - 2]) < f[col]) && - (abs(v - p_src[col - 1]) < f[col]) && - (abs(v - p_src[col + 1]) < f[col]) && - (abs(v - p_src[col + 2]) < f[col])) { + if ((abs(v - p_src[col - 2]) < flimits[col]) && + (abs(v - p_src[col - 1]) < flimits[col]) && + (abs(v - p_src[col + 1]) < flimits[col]) && + (abs(v - p_src[col + 2]) < flimits[col])) { unsigned char k1, k2, k3; k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1; k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1; @@ -104,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, p_dst[col - 1] = d[(col - 1) & 3]; /* next row */ - src_ptr += src_pixels_per_line; - dst_ptr += dst_pixels_per_line; + src += src_pitch; + dst += dst_pitch; } } diff --git a/media/libvpx/libvpx/vpx_dsp/fastssim.c b/media/libvpx/libvpx/vpx_dsp/fastssim.c index 0469071a17..4d32a02a55 100644 --- a/media/libvpx/libvpx/vpx_dsp/fastssim.c +++ b/media/libvpx/libvpx/vpx_dsp/fastssim.c @@ -47,7 +47,7 @@ struct fs_ctx { unsigned *col_buf; }; -static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { +static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { unsigned char *data; size_t data_size; int lw; @@ -71,6 +71,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } data = (unsigned char *)malloc(data_size); + if (!data) return -1; _ctx->level = (fs_level *)data; _ctx->nlevels = _nlevels; data += _nlevels * sizeof(*_ctx->level); @@ -95,6 +96,7 @@ static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { lh = (lh + 1) >> 1; } _ctx->col_buf = (unsigned *)data; + return 0; } static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } @@ -128,10 +130,12 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) { int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + - src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + - src2[j1offs + i0] + src2[j1offs + i1]; + dst1[j * w + i] = + (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]); + dst2[j * w + i] = + (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]); } } } @@ -220,12 +224,12 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { ssim = _ctx->level[_l].ssim; c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { - unsigned mux; - unsigned muy; + int64_t mux; + int64_t muy; int i0; int i1; - mux = 5 * col_sums_x[0]; - muy = 5 * col_sums_y[0]; + mux = (int64_t)5 * col_sums_x[0]; + muy = (int64_t)5 * col_sums_y[0]; for (i = 1; i < 4; i++) { i1 = FS_MINI(i, w - 1); mux += col_sums_x[i1]; @@ -237,8 +241,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); - mux += col_sums_x[i1] - col_sums_x[i0]; - muy += col_sums_x[i1] - col_sums_x[i0]; + mux += (int)col_sums_x[i1] - (int)col_sums_x[i0]; + muy += (int)col_sums_x[i1] - (int)col_sums_x[i0]; } } if (j + 1 < h) { @@ -246,8 +250,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) + col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]); + for (i = 0; i < w; i++) + col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]); } } } @@ -343,18 +349,18 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { for (j = 0; j < h + 4; j++) { if (j < h - 1) { for (i = 0; i < w - 1; i++) { - unsigned g1; - unsigned g2; - unsigned gx; - unsigned gy; - g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); - g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + int64_t g1; + int64_t g2; + int64_t gx; + int64_t gy; + g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]); + g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]); gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); - g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); - gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - gx_buf[(j & 7) * stride + i + 4] = gx; - gy_buf[(j & 7) * stride + i + 4] = gy; + g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]); + g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]); + gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2)); + gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx; + gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy; } } else { memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); @@ -452,7 +458,7 @@ static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, double ret; int l; ret = 1; - fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0; fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, _shift); for (l = 0; l < FS_NLEVELS - 1; l++) { diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c index aa59601094..ef66de0247 100644 --- a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c +++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.c @@ -84,14 +84,14 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - output[0] = sum << 1; + output[0] = sum * 2; } -void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; tran_low_t intermediate[64]; int pass; - tran_low_t *output = intermediate; + tran_low_t *out = intermediate; const tran_low_t *in = NULL; // Transform columns @@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = (x0 - x1) * cospi_16_64; t2 = x2 * cospi_24_64 + x3 * cospi_8_64; t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); + out[0] = (tran_low_t)fdct_round_shift(t0); + out[2] = (tran_low_t)fdct_round_shift(t2); + out[4] = (tran_low_t)fdct_round_shift(t1); + out[6] = (tran_low_t)fdct_round_shift(t3); // Stage 2 t0 = (s6 - s5) * cospi_16_64; @@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; + out[1] = (tran_low_t)fdct_round_shift(t0); + out[3] = (tran_low_t)fdct_round_shift(t2); + out[5] = (tran_low_t)fdct_round_shift(t1); + out[7] = (tran_low_t)fdct_round_shift(t3); + out += 8; } in = intermediate; - output = final_output; + out = output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) output[j + i * 8] /= 2; } } @@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - out[j + i * 32] = + output[j + i * 32] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); } } @@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. -void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // TODO(cd): see quality impact of only doing // output[j * 32 + i] = (temp_out[j] + 1) >> 2; // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, vpx_fdct4x4_c(input, output, stride); } -void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_c(input, final_output, stride); + vpx_fdct8x8_c(input, output, stride); } -void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_1_c(input, final_output, stride); + vpx_fdct8x8_1_c(input, output, stride); } void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, @@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, vpx_fdct16x16_1_c(input, output, stride); } -void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - vpx_fdct32x32_c(input, out, stride); +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_c(input, output, stride); } -void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_rd_c(input, out, stride); + vpx_fdct32x32_rd_c(input, output, stride); } -void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_1_c(input, out, stride); + vpx_fdct32x32_1_c(input, output, stride); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h index 29e139c73b..a43c8ea7f7 100644 --- a/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h +++ b/media/libvpx/libvpx/vpx_dsp/fwd_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_FWD_TXFM_H_ -#define VPX_DSP_FWD_TXFM_H_ +#ifndef VPX_VPX_DSP_FWD_TXFM_H_ +#define VPX_VPX_DSP_FWD_TXFM_H_ #include "vpx_dsp/txfm_common.h" @@ -22,4 +22,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { } void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round); -#endif // VPX_DSP_FWD_TXFM_H_ +#endif // VPX_VPX_DSP_FWD_TXFM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/intrapred.c b/media/libvpx/libvpx/vpx_dsp/intrapred.c index eca17a983e..4b37deef0e 100644 --- a/media/libvpx/libvpx/vpx_dsp/intrapred.c +++ b/media/libvpx/libvpx/vpx_dsp/intrapred.c @@ -14,7 +14,7 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" -#define DST(x, y) dst[(x) + (y)*stride] +#define DST(x, y) dst[(x) + (y) * stride] #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) @@ -42,23 +42,6 @@ static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -#if CONFIG_MISC_FIXES -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)above; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -76,22 +59,6 @@ static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { const uint8_t above_right = above[bs - 1]; @@ -111,21 +78,6 @@ static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -#if CONFIG_MISC_FIXES -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { int r, c; @@ -367,7 +319,7 @@ void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, DST(3, 3) = AVG3(E, F, G); // differs from vp8 } -void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int A = above[0]; const int B = above[1]; @@ -533,75 +485,46 @@ static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, } } -#if CONFIG_MISC_FIXES -static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)above; - (void)bd; - - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} -#endif // CONFIG_MISC_FIXES - static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; + int size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } -#define highbd_d63e_predictor highbd_d63_predictor - static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 - ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) - : above[bs * 2 - 1]; - } - dst += stride; - } -} -#if CONFIG_MISC_FIXES -static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)left; - (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); - } + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); dst += stride; } } -#endif // CONFIG_MISC_FIXES static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, @@ -633,19 +556,30 @@ static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + int i; +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint16_t border[69]; +#else + uint16_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif (void)bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; - dst += stride; + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs * sizeof(dst[0])); } } @@ -776,6 +710,144 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, dst += stride; } } + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + DST(3, 2) = AVG2(E, F); // differs from vp8 + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 3) = AVG3(E, F, G); // differs from vp8 +} + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = H; // differs from vp8 +} + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)bd; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + (void)bd; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} #endif // CONFIG_VP9_HIGHBITDEPTH // This serves as a wrapper function, so that all the prediction functions @@ -811,7 +883,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ intra_pred_highbd_sized(type, 8) \ intra_pred_highbd_sized(type, 16) \ intra_pred_highbd_sized(type, 32) @@ -832,11 +903,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_no_4x4(d207) intra_pred_no_4x4(d63) intra_pred_no_4x4(d45) -#if CONFIG_MISC_FIXES -intra_pred_allsizes(d207e) -intra_pred_allsizes(d63e) -intra_pred_no_4x4(d45e) -#endif intra_pred_no_4x4(d117) intra_pred_no_4x4(d135) intra_pred_no_4x4(d153) diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c index 0f9aff1892..97655b3a9e 100644 --- a/media/libvpx/libvpx/vpx_dsp/inv_txfm.c +++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.c @@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -93,17 +93,54 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { } } +void iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + // 32-bit result is enough for the following multiplications. + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; + int16_t step[4]; tran_high_t temp1, temp2; // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; step[0] = WRAPLOW(dct_const_round_shift(temp1)); step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; step[2] = WRAPLOW(dct_const_round_shift(temp1)); step[3] = WRAPLOW(dct_const_round_shift(temp2)); @@ -141,7 +178,8 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -155,134 +193,6 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void iadst4_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = WRAPLOW(x0 - x2 + x3); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); -} - void iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; tran_high_t x0 = input[7]; @@ -358,6 +268,85 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } +void idct8_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_low_t out[8 * 8] = { 0 }; @@ -383,193 +372,17 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15]); - output[1] = WRAPLOW(step2[1] + step2[14]); - output[2] = WRAPLOW(step2[2] + step2[13]); - output[3] = WRAPLOW(step2[3] + step2[12]); - output[4] = WRAPLOW(step2[4] + step2[11]); - output[5] = WRAPLOW(step2[5] + step2[10]); - output[6] = WRAPLOW(step2[6] + step2[9]); - output[7] = WRAPLOW(step2[7] + step2[8]); - output[8] = WRAPLOW(step2[7] - step2[8]); - output[9] = WRAPLOW(step2[6] - step2[9]); - output[10] = WRAPLOW(step2[5] - step2[10]); - output[11] = WRAPLOW(step2[4] - step2[11]); - output[12] = WRAPLOW(step2[3] - step2[12]); - output[13] = WRAPLOW(step2[2] - step2[13]); - output[14] = WRAPLOW(step2[1] - step2[14]); - output[15] = WRAPLOW(step2[0] - step2[15]); -} - -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - tran_low_t temp_in[16], temp_out[16]; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - // First transform rows - for (i = 0; i < 16; ++i) { - idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; } } @@ -741,6 +554,222 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { output[15] = WRAPLOW(-x1); } +void idct16_c(const tran_low_t *input, tran_low_t *output) { + int16_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]); + output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]); + output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]); + output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]); + output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]); + output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]); + output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]); + output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]); + output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]); + output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]); + output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]); + output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]); + output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]); + output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]); + output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]); + output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]); +} + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; @@ -770,7 +799,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -781,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; + int16_t step1[32], step2[32]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; step1[16] = WRAPLOW(dct_const_round_shift(temp1)); step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; step1[17] = WRAPLOW(dct_const_round_shift(temp1)); step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; step1[18] = WRAPLOW(dct_const_round_shift(temp1)); step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; step1[19] = WRAPLOW(dct_const_round_shift(temp1)); step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; step1[20] = WRAPLOW(dct_const_round_shift(temp1)); step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; step1[21] = WRAPLOW(dct_const_round_shift(temp1)); step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; step1[22] = WRAPLOW(dct_const_round_shift(temp1)); step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; step1[23] = WRAPLOW(dct_const_round_shift(temp1)); step1[24] = WRAPLOW(dct_const_round_shift(temp2)); @@ -1156,16 +1186,10 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, // Rows for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + int16_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) idct32_c(input, outptr); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -1239,7 +1263,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1264,7 +1289,7 @@ static INLINE int detect_invalid_highbd_input(const tran_low_t *input, return 0; } -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ @@ -1273,7 +1298,6 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, tran_high_t a1, b1, c1, d1, e1; const tran_low_t *ip = input; tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); for (i = 0; i < 4; i++) { a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1322,14 +1346,13 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); (void)bd; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1351,178 +1374,6 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, } } -void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void)bd; - - if (detect_invalid_highbd_input(input, 4)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd txfm input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 4); - return; - } - - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 - output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); - output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); - output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); - output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); -} - -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += stride; - } -} - -void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - - if (detect_invalid_highbd_input(input, 8)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd txfm input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 8); - return; - } - - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 & stage 3 - even half - vpx_highbd_idct4_c(step1, step1, bd); - - // stage 2 - odd half - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - // stage 3 - odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - // stage 4 - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); -} - -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } -} - -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 5); - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); - dest += stride; - } -} - void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; tran_low_t x0 = input[0]; @@ -1544,13 +1395,13 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { return; } - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); s0 = s0 + s3 + s5; @@ -1568,6 +1419,83 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); } +void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + // stage 1 + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); +} + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + vpx_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + vpx_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += stride; + } +} + void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; tran_low_t x0 = input[7]; @@ -1594,14 +1522,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); @@ -1617,10 +1545,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1632,10 +1560,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); @@ -1652,13 +1580,95 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(-x1, bd); } -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = + input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 & stage 3 - even half + vpx_highbd_idct4_c(step1, step1, bd); + + // stage 2 - odd half + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + // stage 4 + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); +} + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows // Only first 4 row has non-zero coefs @@ -1679,6 +1689,199 @@ void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, } } +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_high_t a1; + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); + dest += stride; + } +} + +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; + (void)bd; + + if (detect_invalid_highbd_input(input, 16)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 16); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); + + // stage 4 + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); +} + void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[16], step2[16]; tran_high_t temp1, temp2; @@ -1720,23 +1923,31 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1746,12 +1957,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1765,12 +1980,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -1780,12 +1997,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -1797,8 +2018,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -1823,12 +2044,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -1853,13 +2074,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); } -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows for (i = 0; i < 16; ++i) { @@ -1879,190 +2099,40 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; - - if (detect_invalid_highbd_input(input, 16)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd txfm input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 16); - return; - } - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[16], temp_out[16]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. @@ -2083,15 +2153,15 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2131,43 +2201,59 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[14] = input[14]; step1[15] = input[30]; - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2181,23 +2267,31 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2224,12 +2318,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2244,22 +2342,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; @@ -2268,12 +2374,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -2283,12 +2391,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -2318,8 +2430,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -2335,20 +2447,28 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; @@ -2369,12 +2489,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -2420,20 +2540,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; @@ -2476,26 +2596,19 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); } -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows for (i = 0; i < 32; ++i) { - tran_low_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + tran_low_t zero_coeff = 0; + for (j = 0; j < 32; ++j) zero_coeff |= input[j]; - if (zero_coeff[0] | zero_coeff[1]) + if (zero_coeff) highbd_idct32_c(input, outptr, bd); else memset(outptr, 0, sizeof(tran_low_t) * 32); @@ -2514,13 +2627,40 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // Only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + highbd_idct32_c(input, outptr, bd); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + highbd_idct32_c(temp_in, temp_out, bd); + for (j = 0; j < 32; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } + } +} + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; tran_low_t temp_in[32], temp_out[32]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows // Only upper-left 8x8 has non-zero coeff @@ -2541,15 +2681,15 @@ void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h index 13137659fa..6eedbeac35 100644 --- a/media/libvpx/libvpx/vpx_dsp/inv_txfm.h +++ b/media/libvpx/libvpx/vpx_dsp/inv_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_INV_TXFM_H_ -#define VPX_DSP_INV_TXFM_H_ +#ifndef VPX_VPX_DSP_INV_TXFM_H_ +#define VPX_VPX_DSP_INV_TXFM_H_ #include @@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits - #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ @@ -123,4 +122,4 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { } // extern "C" #endif -#endif // VPX_DSP_INV_TXFM_H_ +#endif // VPX_VPX_DSP_INV_TXFM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c new file mode 100644 index 0000000000..750c9de29f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_lsx.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h" + +void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + ptrdiff_t src_stride2 = src_stride << 1; + ptrdiff_t src_stride3 = src_stride2 + src_stride; + ptrdiff_t src_stride4 = src_stride2 << 1; + ptrdiff_t src_stride6 = src_stride3 << 1; + + int16_t *src_tmp = (int16_t *)src; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2); + src3 = __lsx_vldx(src_tmp, src_stride6); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6); + src7 = __lsx_vldx(src_tmp, src_stride6); + + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, + tmp4, tmp6, tmp7, tmp5, tmp3, tmp1); + LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, + src4, src5, src7, src6, src3, src2); + LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, + tmp3, tmp4, tmp5, tmp1, tmp6, tmp2); + store_tran_low(tmp0, dst, 0); + store_tran_low(tmp1, dst, 8); + store_tran_low(tmp2, dst, 16); + store_tran_low(tmp3, dst, 24); + store_tran_low(tmp4, dst, 32); + store_tran_low(tmp5, dst, 40); + store_tran_low(tmp6, dst, 48); + store_tran_low(tmp7, dst, 56); +} + +void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride, + tran_low_t *dst) { + int i; + __m128i a0, a1, a2, a3, b0, b1, b2, b3; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0); + /* Top right. */ + vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64); + /* Bottom left. */ + vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128); + /* Bottom right. */ + vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192); + + for (i = 0; i < 64; i += 8) { + a0 = load_tran_low(dst); + a1 = load_tran_low(dst + 64); + a2 = load_tran_low(dst + 128); + a3 = load_tran_low(dst + 192); + + LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1); + DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3); + LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2); + + store_tran_low(a0, dst, 0); + store_tran_low(a1, dst, 64); + store_tran_low(a2, dst, 128); + store_tran_low(a3, dst, 192); + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c new file mode 100644 index 0000000000..482626080a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_util/loongson_intrinsics.h" + +void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + // width > 8 || width == 8 || width == 4 + if (width > 8) { + int i, j; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + __m128i p, r, avg; + + p = __lsx_vld(pred + j, 0); + r = __lsx_vld(ref + j, 0); + avg = __lsx_vavgr_bu(p, r); + __lsx_vst(avg, comp_pred + j, 0); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + int i = height * width; + do { + __m128i p, r, r_0, r_1; + + p = __lsx_vld(pred, 0); + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r = __lsx_vilvl_d(r_1, r_0); + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + + pred += 16; + comp_pred += 16; + i -= 16; + } while (i); + } else { // width = 4 + int i = height * width; + assert(width == 4); + do { + __m128i p, r, r_0, r_1, r_2, r_3; + p = __lsx_vld(pred, 0); + + if (width == ref_stride) { + r = __lsx_vld(ref, 0); + ref += 16; + } else { + r_0 = __lsx_vld(ref, 0); + ref += ref_stride; + r_1 = __lsx_vld(ref, 0); + ref += ref_stride; + r_2 = __lsx_vld(ref, 0); + ref += ref_stride; + r_3 = __lsx_vld(ref, 0); + ref += ref_stride; + DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2); + r = __lsx_vilvl_d(r_2, r_0); + } + r = __lsx_vavgr_bu(p, r); + + __lsx_vst(r, comp_pred, 0); + comp_pred += 16; + pred += 16; + i -= 16; + } while (i); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h new file mode 100644 index 0000000000..b0db1e99c5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i load_tran_low(const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m = __lsx_vld(s, 0); + __m128i v1_m = __lsx_vld(s + 4, 0); + return __lsx_vsrlni_h_w(v0_m, v1_m, 0); +#else + return __lsx_vld(s, 0); +#endif +} + +static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) { +#if CONFIG_VP9_HIGHBITDEPTH + __m128i v0_m, v1_m; + v1_m = __lsx_vexth_w_h(v); + v0_m = __lsx_vsllwil_w_h(v, 0); + __lsx_vst(v0_m, s + c, 0); + __lsx_vst(v1_m, s + c + 4, 0); +#else + __lsx_vst(v, s + c, 0); +#endif +} + +#endif // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c new file mode 100644 index 0000000000..9bb3877212 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c @@ -0,0 +1,1176 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" +#include "vpx_dsp/fwd_txfm.h" + +#define UNPCK_SH_SW(in, out0, out1) \ + do { \ + out0 = __lsx_vsllwil_w_h(in, 0); \ + out1 = __lsx_vexth_w_h(in); \ + } while (0) + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i step0, step1, step2, step3; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + __m128i step0_1, step1_1, step2_1, step3_1; + + int32_t stride = src_stride << 1; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + const int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp = input + (src_stride * 24); + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 0); + __lsx_vst(step1, temp_buff, 16); + __lsx_vst(step2, temp_buff, 32); + __lsx_vst(step3, temp_buff, 48); + + __lsx_vst(in4, temp_buff, 448); + __lsx_vst(in5, temp_buff, 464); + __lsx_vst(in6, temp_buff, 480); + __lsx_vst(in7, temp_buff, 496); + + __lsx_vst(step0_1, temp_buff, 64); + __lsx_vst(step1_1, temp_buff, 80); + __lsx_vst(step2_1, temp_buff, 96); + __lsx_vst(step3_1, temp_buff, 112); + + __lsx_vst(in4_1, temp_buff, 384); + __lsx_vst(in5_1, temp_buff, 400); + __lsx_vst(in6_1, temp_buff, 416); + __lsx_vst(in7_1, temp_buff, 432); + + /* 3rd and 4th set */ + input_tmp = input + (src_stride * 8); + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2); + in3 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in0_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1); + in3_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4_1 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1); + in7_1 = __lsx_vldx(input_tmp, stride3); + + input_tmp += stride2; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6); + in7 = __lsx_vldx(input_tmp, stride3); + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1, + in2_1, in3_1); + DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1, + in6_1, in7_1); + + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, + in7_1); + + __lsx_vst(step0, temp_buff, 128); + __lsx_vst(step1, temp_buff, 144); + __lsx_vst(step2, temp_buff, 160); + __lsx_vst(step3, temp_buff, 176); + + __lsx_vst(in4, temp_buff, 320); + __lsx_vst(in5, temp_buff, 336); + __lsx_vst(in6, temp_buff, 352); + __lsx_vst(in7, temp_buff, 368); + + __lsx_vst(step0_1, temp_buff, 192); + __lsx_vst(step1_1, temp_buff, 208); + __lsx_vst(step2_1, temp_buff, 224); + __lsx_vst(step3_1, temp_buff, 240); + + __lsx_vst(in4_1, temp_buff, 256); + __lsx_vst(in5_1, temp_buff, 272); + __lsx_vst(in6_1, temp_buff, 288); + __lsx_vst(in7_1, temp_buff, 304); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i temp0, temp1; + + /* fdct even */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, + vec2, vec3, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, + vec7, in8, in9, in10, in11); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 0); + __lsx_vst(temp1, temp, 1024); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 512); + __lsx_vst(temp1, temp, 1536); + + DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, + vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 256); + __lsx_vst(temp1, temp, 1792); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1280); + __lsx_vst(temp1, temp, 768); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 128); + __lsx_vst(temp1, temp, 1920); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 1152); + __lsx_vst(temp1, temp, 896); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 640); + __lsx_vst(temp1, temp, 1408); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + __lsx_vst(temp0, temp, 384); + __lsx_vst(temp1, temp, 1664); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21, + in26, in27); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19, + in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, input, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, input, 80); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, input, 160); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, input, 176); + + in21 = __lsx_vadd_h(in18, in21); + in20 = __lsx_vadd_h(in19, in20); + in27 = __lsx_vadd_h(in28, in27); + in26 = __lsx_vadd_h(in29, in26); + + DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22, + in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17, + in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, input, 32); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, input, 48); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, input, 192); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, input, 208); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 0); + __lsx_vst(vec4, temp_ptr, 1920); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 896); + __lsx_vst(vec4, temp_ptr, 1024); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 1408); + __lsx_vst(vec5, temp_ptr, 512); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec4, temp_ptr, 384); + __lsx_vst(vec5, temp_ptr, 1536); + + DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23, + in20, in21); + DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26, + in27, in24, in25); + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1664); + __lsx_vst(vec4, temp_ptr, 256); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 640); + __lsx_vst(vec4, temp_ptr, 1280); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 1152); + __lsx_vst(vec4, temp_ptr, 768); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + __lsx_vst(vec5, temp_ptr, 128); + __lsx_vst(vec4, temp_ptr, 1792); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i step0, step1, step2, step3, step4, step5, step6, step7; + + DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff, + 192, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384, + temp_buff, 448, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff, + 240, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432, + temp_buff, 496, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 0); + __lsx_vst(step1, output, 16); + __lsx_vst(step2, output, 32); + __lsx_vst(step3, output, 48); + __lsx_vst(step4, output, 64); + __lsx_vst(step5, output, 80); + __lsx_vst(step6, output, 96); + __lsx_vst(step7, output, 112); + + __lsx_vst(in8, output, 384); + __lsx_vst(in9, output, 400); + __lsx_vst(in10, output, 416); + __lsx_vst(in11, output, 432); + __lsx_vst(in12, output, 448); + __lsx_vst(in13, output, 464); + __lsx_vst(in14, output, 480); + __lsx_vst(in15, output, 496); + + /* 2nd set */ + DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff, + 208, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400, + temp_buff, 464, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff, + 224, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416, + temp_buff, 480, in12, in13, in14, in15); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, step0, step1, step2, step3, + step4, step5, step6, step7, in8, in9, in10, in11, in12, + in13, in14, in15); + + __lsx_vst(step0, output, 128); + __lsx_vst(step1, output, 144); + __lsx_vst(step2, output, 160); + __lsx_vst(step3, output, 176); + __lsx_vst(step4, output, 192); + __lsx_vst(step5, output, 208); + __lsx_vst(step6, output, 224); + __lsx_vst(step7, output, 240); + + __lsx_vst(in8, output, 256); + __lsx_vst(in9, output, 272); + __lsx_vst(in10, output, 288); + __lsx_vst(in11, output, 304); + __lsx_vst(in12, output, 320); + __lsx_vst(in13, output, 336); + __lsx_vst(in14, output, 352); + __lsx_vst(in15, output, 368); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12, + in13, in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + __lsx_vst(vec0, interm_ptr, 0); + __lsx_vst(vec1, interm_ptr, 16); + __lsx_vst(vec2, interm_ptr, 32); + __lsx_vst(vec3, interm_ptr, 48); + __lsx_vst(vec4, interm_ptr, 64); + __lsx_vst(vec5, interm_ptr, 80); + __lsx_vst(vec6, interm_ptr, 96); + __lsx_vst(vec7, interm_ptr, 112); + + __lsx_vst(in8, interm_ptr, 128); + __lsx_vst(in9, interm_ptr, 144); + __lsx_vst(in10, interm_ptr, 160); + __lsx_vst(in11, interm_ptr, 176); + __lsx_vst(in12, interm_ptr, 192); + __lsx_vst(in13, interm_ptr, 208); + __lsx_vst(in14, interm_ptr, 224); + __lsx_vst(in15, interm_ptr, 240); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, + vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); + LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, + vec5_r); + DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, + vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); + + tmp3_w = __lsx_vadd_w(vec0_r, vec3_r); + vec0_r = __lsx_vsub_w(vec0_r, vec3_r); + vec3_r = __lsx_vadd_w(vec1_r, vec2_r); + vec1_r = __lsx_vsub_w(vec1_r, vec2_r); + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 16); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + __lsx_vst(vec5, out, 32); + __lsx_vst(vec4, out, 48); + + DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32, + interm_ptr, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96, + interm_ptr, 112, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 64); + __lsx_vst(in5, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 80); + __lsx_vst(in5, out, 96); + + DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160, + interm_ptr, 176, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224, + interm_ptr, 240, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 128); + __lsx_vst(in5, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 144); + __lsx_vst(in5, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + tmp0_w = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 160); + __lsx_vst(in5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + __lsx_vst(in4, out, 192); + __lsx_vst(in5, out, 176); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + __m128i tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = __lsx_vld(temp, 0); + in4 = __lsx_vld(temp, 64); + in2 = __lsx_vld(temp, 128); + in6 = __lsx_vld(temp, 192); + in1 = __lsx_vld(temp, 256); + in7 = __lsx_vld(temp, 304); + in3 = __lsx_vld(temp, 384); + in5 = __lsx_vld(temp, 432); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = __lsx_vld(temp, 32); + in1_1 = __lsx_vld(temp, 464); + in2_1 = __lsx_vld(temp, 160); + in3_1 = __lsx_vld(temp, 336); + in4_1 = __lsx_vld(temp, 96); + in5_1 = __lsx_vld(temp, 352); + in6_1 = __lsx_vld(temp, 224); + in7_1 = __lsx_vld(temp, 480); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 64); + __lsx_vst(in2, output, 128); + __lsx_vst(in3, output, 192); + __lsx_vst(in4, output, 256); + __lsx_vst(in5, output, 320); + __lsx_vst(in6, output, 384); + __lsx_vst(in7, output, 448); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = __lsx_vld(temp, 16); + in1 = __lsx_vld(temp, 272); + in2 = __lsx_vld(temp, 144); + in3 = __lsx_vld(temp, 400); + in4 = __lsx_vld(temp, 80); + in5 = __lsx_vld(temp, 416); + in6 = __lsx_vld(temp, 208); + in7 = __lsx_vld(temp, 288); + + __lsx_vst(in0_1, output, 16); + __lsx_vst(in1_1, output, 80); + __lsx_vst(in2_1, output, 144); + __lsx_vst(in3_1, output, 208); + __lsx_vst(in4_1, output, 272); + __lsx_vst(in5_1, output, 336); + __lsx_vst(in6_1, output, 400); + __lsx_vst(in7_1, output, 464); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + __lsx_vst(in0, output, 32); + __lsx_vst(in1, output, 96); + __lsx_vst(in2, output, 160); + __lsx_vst(in3, output, 224); + __lsx_vst(in4, output, 288); + __lsx_vst(in5, output, 352); + __lsx_vst(in6, output, 416); + __lsx_vst(in7, output, 480); + + /* 4th set */ + in0_1 = __lsx_vld(temp, 48); + in1_1 = __lsx_vld(temp, 448); + in2_1 = __lsx_vld(temp, 176); + in3_1 = __lsx_vld(temp, 320); + in4_1 = __lsx_vld(temp, 112); + in5_1 = __lsx_vld(temp, 368); + in6_1 = __lsx_vld(temp, 240); + in7_1 = __lsx_vld(temp, 496); + + LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + __lsx_vst(in0_1, output, 48); + __lsx_vst(in1_1, output, 112); + __lsx_vst(in2_1, output, 176); + __lsx_vst(in3_1, output, 240); + __lsx_vst(in4_1, output, 304); + __lsx_vst(in5_1, output, 368); + __lsx_vst(in6_1, output, 432); + __lsx_vst(in7_1, output, 496); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6, + in7); + DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13, + in14, in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, + vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, + in15); + + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, + in1, in2, in3); + + temp0 = __lsx_vadd_h(in0, in3); + in0 = __lsx_vsub_h(in0, in3); + in3 = __lsx_vadd_h(in1, in2); + in1 = __lsx_vsub_h(in1, in2); + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + __lsx_vst(temp0, out, 0); + __lsx_vst(temp1, out, 16); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + __lsx_vst(temp0, out, 32); + __lsx_vst(temp1, out, 48); + + DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, + vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + __lsx_vst(temp0, out, 64); + __lsx_vst(temp1, out, 112); + + DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + __lsx_vst(temp0, out, 80); + __lsx_vst(temp1, out, 96); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, + vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + __lsx_vst(temp0, out, 128); + __lsx_vst(temp1, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + __lsx_vst(temp0, out, 144); + __lsx_vst(temp1, out, 224); + + DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5); + temp0 = __lsx_vneg_h(vec2); + DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1); + DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, + vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + __lsx_vst(temp0, out, 160); + __lsx_vst(temp1, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + __lsx_vst(temp0, out, 192); + __lsx_vst(temp1, out, 176); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + __m128i in16, in17, in18, in19, in20, in21, in22, in23; + __m128i in24, in25, in26, in27, in28, in29, in30, in31; + __m128i vec4, vec5, tmp0, tmp1; + + in20 = __lsx_vld(temp, 64); + in21 = __lsx_vld(temp, 80); + in26 = __lsx_vld(temp, 160); + in27 = __lsx_vld(temp, 176); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = __lsx_vld(temp, 32); + in19 = __lsx_vld(temp, 48); + in28 = __lsx_vld(temp, 192); + in29 = __lsx_vld(temp, 208); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = __lsx_vsub_h(in19, in20); + __lsx_vst(vec4, interm_ptr, 64); + vec4 = __lsx_vsub_h(in18, in21); + __lsx_vst(vec4, interm_ptr, 176); + vec4 = __lsx_vsub_h(in29, in26); + __lsx_vst(vec4, interm_ptr, 128); + vec4 = __lsx_vsub_h(in28, in27); + __lsx_vst(vec4, interm_ptr, 112); + + DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21, + in20, in27, in26); + + in22 = __lsx_vld(temp, 96); + in23 = __lsx_vld(temp, 112); + in24 = __lsx_vld(temp, 128); + in25 = __lsx_vld(temp, 144); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = __lsx_vld(temp, 0); + in17 = __lsx_vld(temp, 16); + in30 = __lsx_vld(temp, 224); + in31 = __lsx_vld(temp, 240); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = __lsx_vsub_h(in17, in22); + __lsx_vst(vec4, interm_ptr, 80); + vec4 = __lsx_vsub_h(in30, in25); + __lsx_vst(vec4, interm_ptr, 96); + vec4 = __lsx_vsub_h(in31, in24); + __lsx_vst(vec4, interm_ptr, 144); + vec4 = __lsx_vsub_h(in16, in23); + __lsx_vst(vec4, interm_ptr, 160); + + DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16, + in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27, + in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + __lsx_vst(vec5, out, 0); + __lsx_vst(vec4, out, 240); + + DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + __lsx_vst(vec5, out, 224); + __lsx_vst(vec4, out, 16); + + DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23, + in26, in24, in20); + tmp0 = __lsx_vneg_h(in23); + DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25); + DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + __lsx_vst(vec4, out, 32); + __lsx_vst(vec5, out, 208); + + DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + __lsx_vst(vec4, out, 48); + __lsx_vst(vec5, out, 192); + + in20 = __lsx_vld(interm_ptr, 64); + in21 = __lsx_vld(interm_ptr, 176); + in27 = __lsx_vld(interm_ptr, 112); + in26 = __lsx_vld(interm_ptr, 128); + + in16 = in20; + in17 = in21; + DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1); + DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = __lsx_vld(interm_ptr, 80); + in25 = __lsx_vld(interm_ptr, 96); + in24 = __lsx_vld(interm_ptr, 144); + in23 = __lsx_vld(interm_ptr, 160); + + DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28, + in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = __lsx_vadd_h(in28, in29); + in19 = __lsx_vadd_h(in31, in30); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + __lsx_vst(vec5, out, 64); + __lsx_vst(vec4, out, 176); + + DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + __lsx_vst(vec5, out, 80); + __lsx_vst(vec4, out, 160); + + DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16, + in29, in30, in19); + tmp0 = __lsx_vneg_h(in16); + DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31); + DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + __lsx_vst(vec5, out, 144); + __lsx_vst(vec4, out, 96); + + DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + __lsx_vst(vec4, out, 112); + __lsx_vst(vec5, out, 128); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c new file mode 100644 index 0000000000..508532b9d8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + do { \ + __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \ + \ + DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \ + DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _t2 = __lsx_vilvl_h(_s3, _s2); \ + _t3 = __lsx_vilvh_h(_s3, _s2); \ + DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \ + DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \ + } while (0) + +#if !CONFIG_VP9_HIGHBITDEPTH +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30; + __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; + __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; + __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int32_t src_stride8 = src_stride4 << 1; + int16_t *input_tmp = (int16_t *)input; + in0 = __lsx_vld(input_tmp, 0); + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8); + input_tmp += src_stride4; + DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, + input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11, + in12); + input_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13, + in14); + input_tmp += src_stride2; + in15 = __lsx_vldx(input_tmp, src_stride2); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0, + tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, + tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + __lsx_vst(tmp0, tmp_ptr, 0); + __lsx_vst(tmp1, tmp_ptr, 64); + __lsx_vst(tmp2, tmp_ptr, 128); + __lsx_vst(tmp3, tmp_ptr, 192); + __lsx_vst(tmp4, tmp_ptr, 256); + __lsx_vst(tmp5, tmp_ptr, 320); + __lsx_vst(tmp6, tmp_ptr, 384); + __lsx_vst(tmp7, tmp_ptr, 448); + DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15, + in14, in13, in12); + DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, + in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5); + + cnst4 = __lsx_vreplvei_h(coeff, 0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25); + + cnst5 = __lsx_vreplvei_h(coeff, 1); + cnst5 = __lsx_vpackev_h(cnst5, cnst4); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23); + + /* stp2 */ + LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4); + DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5); + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26); + + cnst0 = __lsx_vreplvei_h(coeff, 4); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21); + + LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + vec1 = __lsx_vilvl_h(in15, in8); + vec0 = __lsx_vilvh_h(in15, in8); + + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 0); + + cnst0 = __lsx_vreplvei_h(coeff2, 0); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 448); + + vec1 = __lsx_vilvl_h(in14, in9); + vec0 = __lsx_vilvh_h(in14, in9); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 256); + + cnst1 = __lsx_vreplvei_h(coeff2, 2); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 192); + + DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25); + + cnst1 = __lsx_vreplvei_h(coeff, 3); + cnst1 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22); + + /* stp4 */ + DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10); + + vec1 = __lsx_vilvl_h(in13, in10); + vec0 = __lsx_vilvh_h(in13, in10); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 128); + + cnst0 = __lsx_vreplvei_h(coeff2, 1); + cnst0 = __lsx_vpackev_h(cnst1, cnst0); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 320); + + DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11); + vec1 = __lsx_vilvl_h(in12, in11); + vec0 = __lsx_vilvh_h(in12, in11); + DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1); + cnst1 = __lsx_vpackev_h(cnst1, cnst0); + + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8); + __lsx_vst(in8, tmp_ptr, 384); + + cnst1 = __lsx_vreplvei_h(coeff2, 3); + cnst0 = __lsx_vpackev_h(cnst0, cnst1); + DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8); + __lsx_vst(in8, tmp_ptr, 64); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i in8, in9, in10, in11, in12, in13, in14, in15; + int16_t *input_tmp = input; + + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2, + in3); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5, + in6, in7); + DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp, + 112, in8, in9, in10, in11); + DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208, + input_tmp, 240, in12, in13, in14, in15); + + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, + in14, in15); + + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10, + in11); + DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14, + in15); + LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, + in15); + __lsx_vst(in8, input, 0); + __lsx_vst(in9, input, 32); + __lsx_vst(in10, input, 64); + __lsx_vst(in11, input, 96); + __lsx_vst(in12, input, 128); + __lsx_vst(in13, input, 160); + __lsx_vst(in14, input, 192); + __lsx_vst(in15, input, 224); + + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9, + in10, in11); + DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12, + in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + __lsx_vst(tmp0, output, 0); + __lsx_vst(in0, output, 32); + __lsx_vst(tmp1, output, 64); + __lsx_vst(in1, output, 96); + __lsx_vst(tmp2, output, 128); + __lsx_vst(in2, output, 160); + __lsx_vst(tmp3, output, 192); + __lsx_vst(in3, output, 224); + + LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + __lsx_vst(tmp4, output, 16); + __lsx_vst(in4, output, 48); + __lsx_vst(tmp5, output, 80); + __lsx_vst(in5, output, 112); + __lsx_vst(tmp6, output, 144); + __lsx_vst(in6, output, 176); + __lsx_vst(tmp7, output, 208); + __lsx_vst(in7, output, 240); +} + +void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + + in0 = __lsx_vld(input, 0); + DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2); + in3 = __lsx_vldx(input, src_stride6); + + /* fdct4 pre-process */ + { + __m128i vec, mask; + __m128i zero = __lsx_vldi(0); + + mask = __lsx_vinsgr2vr_b(zero, 1, 0); + DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2, + in3); + vec = __lsx_vseqi_h(in0, 0); + vec = __lsx_vxori_b(vec, 255); + vec = __lsx_vand_v(mask, vec); + in0 = __lsx_vadd_h(in0, vec); + } + + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2); + __lsx_vst(in0, output, 0); + __lsx_vst(in2, output, 16); +} + +void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t src_stride6 = src_stride4 + src_stride2; + int16_t *input_tmp = (int16_t *)input; + + in0 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1, + in2); + in3 = __lsx_vldx(input_tmp, src_stride6); + input_tmp += src_stride4; + in4 = __lsx_vld(input_tmp, 0); + DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5, + in6); + in7 = __lsx_vldx(input_tmp, src_stride6); + + DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3); + DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7); + + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + + __lsx_vst(in0, output, 0); + __lsx_vst(in1, output, 16); + __lsx_vst(in2, output, 32); + __lsx_vst(in3, output, 48); + __lsx_vst(in4, output, 64); + __lsx_vst(in5, output, 80); + __lsx_vst(in6, output, 96); + __lsx_vst(in7, output, 112); +} + +void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h new file mode 100644 index 0000000000..4a9fce9a3d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ + +#include "vpx_dsp/loongarch/txfm_macros_lsx.h" +#include "vpx_dsp/txfm_common.h" + +#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + do { \ + __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m; \ + __m128i vec4_m, vec5_m, vec6_m, vec7_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \ + \ + LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \ + cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \ + vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \ + cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \ + cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \ + vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \ + \ + DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \ + vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \ + vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \ + } while (0) + +#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + do { \ + __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \ + vec1_m, vec2_m, vec3_m); \ + DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \ + vec5_m, vec6_m, vec7_m); \ + DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \ + in3, in0, in1, in2, in3); \ + DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \ + in7, in4, in5, in6, in7); \ + } while (0) + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one = __lsx_vreplgr2vr_h(1); \ + \ + tp0_m = __lsx_vslei_h(vec0, 0); \ + tp1_m = __lsx_vslei_h(vec1, 0); \ + tp0_m = __lsx_vxori_b(tp0_m, 255); \ + tp1_m = __lsx_vxori_b(tp1_m, 255); \ + vec0 = __lsx_vadd_h(vec0, one); \ + vec1 = __lsx_vadd_h(vec1, one); \ + tp0_m = __lsx_vand_v(one, tp0_m); \ + tp1_m = __lsx_vand_v(one, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + do { \ + __m128i tp0_m, tp1_m; \ + __m128i one_m = __lsx_vldi(0x401); \ + \ + tp0_m = __lsx_vslti_h(vec0, 0); \ + tp1_m = __lsx_vslti_h(vec1, 0); \ + vec0 = __lsx_vadd_h(vec0, one_m); \ + vec1 = __lsx_vadd_h(vec1, one_m); \ + tp0_m = __lsx_vand_v(one_m, tp0_m); \ + tp1_m = __lsx_vand_v(one_m, tp1_m); \ + vec0 = __lsx_vadd_h(vec0, tp0_m); \ + vec1 = __lsx_vadd_h(vec1, tp1_m); \ + vec0 = __lsx_vsrai_h(vec0, 2); \ + vec1 = __lsx_vsrai_h(vec1, 2); \ + } while (0) + +#define FDCT32_POSTPROC_NEG_W(vec) \ + do { \ + __m128i temp_m; \ + __m128i one_m = __lsx_vreplgr2vr_w(1); \ + \ + temp_m = __lsx_vslti_w(vec, 0); \ + vec = __lsx_vadd_w(vec, one_m); \ + temp_m = __lsx_vand_v(one_m, temp_m); \ + vec = __lsx_vadd_w(vec, temp_m); \ + vec = __lsx_vsrai_w(vec, 2); \ + } while (0) + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1; \ + __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0); \ + \ + s0_m = __lsx_vreplgr2vr_w((int32_t)const1); \ + k0_m = __lsx_vpackev_w(s0_m, k0_m); \ + \ + DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1); \ + s1_m = __lsx_vilvl_w(_tmp0, reg0_left); \ + s0_m = __lsx_vilvh_w(_tmp0, reg0_left); \ + s3_m = __lsx_vilvl_w(reg0_left, reg1_left); \ + s2_m = __lsx_vilvh_w(reg0_left, reg1_left); \ + s5_m = __lsx_vilvl_w(_tmp1, reg0_right); \ + s4_m = __lsx_vilvh_w(_tmp1, reg0_right); \ + s7_m = __lsx_vilvl_w(reg0_right, reg1_right); \ + s6_m = __lsx_vilvh_w(reg0_right, reg1_right); \ + DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out0, out1); \ + DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m); \ + DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m); \ + DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \ + DCT_CONST_BITS, out2, out3); \ + } while (0) + +#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2, \ + in3) \ + do { \ + __m128i dst0_m, dst1_m, dst2_m, dst3_m; \ + __m128i tmp0_m, tmp1_m; \ + __m128i res0_m, res1_m, res2_m, res3_m; \ + \ + dst0_m = __lsx_vld(dst, 0); \ + DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m); \ + dst3_m = __lsx_vldx(dst, _stride3); \ + DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \ + res0_m, res1_m, res2_m, res3_m); \ + DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m, \ + in3, res0_m, res1_m, res2_m, res3_m); \ + DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0, \ + tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, dst, 0, 0); \ + __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1); \ + __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0); \ + __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1); \ + } while (0) + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + __m128i x0_m, x1_m, x2_m, x3_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \ + \ + /* FDCT stage1 */ \ + LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \ + s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \ + LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \ + x2_m = __lsx_vneg_h(x2_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \ + x2_m = __lsx_vreplvei_h(coeff_m, 2); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \ + \ + /* stage2 */ \ + s1_m = __lsx_vilvl_h(s5_m, s6_m); \ + s0_m = __lsx_vilvh_h(s5_m, s6_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \ + DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \ + \ + /* stage3 */ \ + LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \ + x1_m = __lsx_vpackev_h(x0_m, x1_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \ + x2_m = __lsx_vpackev_h(x3_m, x2_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \ + \ + x1_m = __lsx_vreplvei_h(coeff_m, 5); \ + x0_m = __lsx_vneg_h(x0_m); \ + x0_m = __lsx_vpackev_h(x1_m, x0_m); \ + DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \ + \ + x2_m = __lsx_vreplvei_h(coeff_m, 6); \ + x3_m = __lsx_vneg_h(x3_m); \ + x2_m = __lsx_vpackev_h(x2_m, x3_m); \ + DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \ + } while (0) + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + do { \ + __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + __m128i stp36_m, stp37_m, vec0_m, vec1_m; \ + __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df }; \ + __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 }; \ + __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 }; \ + \ + /* stp 1 */ \ + DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __lsx_vreplvei_h(coeff_m, 0); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m); \ + \ + cnst5_m = __lsx_vreplvei_h(coeff_m, 1); \ + cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m); \ + \ + /* stp2 */ \ + LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, \ + stp32_m, stp33_m); \ + LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, \ + stp35_m, stp34_m); \ + \ + DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, \ + vec4_m); \ + DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, \ + vec5_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 4); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m); \ + \ + DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff_m, 3); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m); \ + \ + /* stp4 */ \ + LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, \ + vec4_m, vec5_m); \ + LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, \ + stp24_m, stp31_m); \ + \ + vec1_m = __lsx_vilvl_h(vec2_m, vec6_m); \ + vec0_m = __lsx_vilvh_h(vec2_m, vec6_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 0); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15); \ + \ + vec1_m = __lsx_vilvl_h(vec4_m, vec5_m); \ + vec0_m = __lsx_vilvh_h(vec4_m, vec5_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 2); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7); \ + \ + vec1_m = __lsx_vilvl_h(stp23_m, stp21_m); \ + vec0_m = __lsx_vilvh_h(stp23_m, stp21_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5); \ + \ + cnst0_m = __lsx_vreplvei_h(coeff2_m, 1); \ + cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11); \ + \ + vec1_m = __lsx_vilvl_h(stp24_m, stp31_m); \ + vec0_m = __lsx_vilvh_h(stp24_m, stp31_m); \ + DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m); \ + cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \ + \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13); \ + \ + cnst1_m = __lsx_vreplvei_h(coeff2_m, 3); \ + cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m); \ + DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3); \ + } while (0) + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c new file mode 100644 index 0000000000..ec07f57d90 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/fwd_txfm_lsx.h" + +#define UNPCK_UB_SH(_in, _out0, _out1) \ + do { \ + _out0 = __lsx_vsllwil_hu_bu(_in, 0); \ + _out1 = __lsx_vexth_hu_bu(_in); \ + } while (0) + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5, + n5); + DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 0); + __lsx_vst(n0, tmp_buf, 16); + __lsx_vst(m1, tmp_buf, 32); + __lsx_vst(n1, tmp_buf, 48); + __lsx_vst(m2, tmp_buf, 64); + __lsx_vst(n2, tmp_buf, 80); + __lsx_vst(m3, tmp_buf, 96); + __lsx_vst(n3, tmp_buf, 112); + __lsx_vst(m4, tmp_buf, 128); + __lsx_vst(n4, tmp_buf, 144); + __lsx_vst(m5, tmp_buf, 160); + __lsx_vst(n5, tmp_buf, 176); + __lsx_vst(m6, tmp_buf, 192); + __lsx_vst(n6, tmp_buf, 208); + __lsx_vst(m7, tmp_buf, 224); + __lsx_vst(n7, tmp_buf, 240); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1, + n1); + DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2, + m3, n3); + DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4, + m5, n5); + DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6, + m7, n7); + + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + + __lsx_vst(m0, tmp_buf, 256); + __lsx_vst(n0, tmp_buf, 272); + __lsx_vst(m1, tmp_buf, 288); + __lsx_vst(n1, tmp_buf, 304); + __lsx_vst(m2, tmp_buf, 320); + __lsx_vst(n2, tmp_buf, 336); + __lsx_vst(m3, tmp_buf, 352); + __lsx_vst(n3, tmp_buf, 368); + __lsx_vst(m4, tmp_buf, 384); + __lsx_vst(n4, tmp_buf, 400); + __lsx_vst(m5, tmp_buf, 416); + __lsx_vst(n5, tmp_buf, 432); + __lsx_vst(m6, tmp_buf, 448); + __lsx_vst(n6, tmp_buf, 464); + __lsx_vst(m7, tmp_buf, 480); + __lsx_vst(n7, tmp_buf, 496); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480, + reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 240); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc3, tmp_eve_buf, 16); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 208); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc3, tmp_eve_buf, 48); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 176); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc3, tmp_eve_buf, 80); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc0, tmp_eve_buf, 144); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc3, tmp_eve_buf, 112); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464, + reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, + vec2, vec0, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + __m128i reg0, reg1, reg2, reg3; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 496); + __lsx_vst(reg1, tmp_buf, 368); + __lsx_vst(reg2, tmp_buf, 432); + __lsx_vst(reg3, tmp_buf, 304); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 464); + __lsx_vst(reg1, tmp_buf, 336); + __lsx_vst(reg2, tmp_buf, 400); + __lsx_vst(reg3, tmp_buf, 272); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 480); + __lsx_vst(reg1, tmp_buf, 352); + __lsx_vst(reg2, tmp_buf, 416); + __lsx_vst(reg3, tmp_buf, 288); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0, + reg1, reg2, reg3); + __lsx_vst(reg0, tmp_buf, 448); + __lsx_vst(reg1, tmp_buf, 320); + __lsx_vst(reg2, tmp_buf, 384); + __lsx_vst(reg3, tmp_buf, 256); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + __lsx_vst(m0, dst, 0); + __lsx_vst(n0, dst, 64); + __lsx_vst(m1, dst, 128); + __lsx_vst(n1, dst, 192); + __lsx_vst(m2, dst, 256); + __lsx_vst(n2, dst, 320); + __lsx_vst(m3, dst, 384); + __lsx_vst(n3, dst, 448); + + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m4, dst, 16); + __lsx_vst(n4, dst, 80); + __lsx_vst(m5, dst, 144); + __lsx_vst(n5, dst, 208); + __lsx_vst(m6, dst, 272); + __lsx_vst(n6, dst, 336); + __lsx_vst(m7, dst, 400); + __lsx_vst(n7, dst, 464); + + /* 3rd & 4th 8x8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304, + m0, n0, m1, n1); + DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368, + m2, n2, m3, n3); + DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432, + m4, n4, m5, n5); + DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496, + m6, n6, m7, n7); + LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + __lsx_vst(m0, dst, 32); + __lsx_vst(n0, dst, 96); + __lsx_vst(m1, dst, 160); + __lsx_vst(n1, dst, 224); + __lsx_vst(m2, dst, 288); + __lsx_vst(n2, dst, 352); + __lsx_vst(m3, dst, 416); + __lsx_vst(n3, dst, 480); + __lsx_vst(m4, dst, 48); + __lsx_vst(n4, dst, 112); + __lsx_vst(m5, dst, 176); + __lsx_vst(n5, dst, 240); + __lsx_vst(m6, dst, 304); + __lsx_vst(n6, dst, 368); + __lsx_vst(m7, dst, 432); + __lsx_vst(n7, dst, 496); +} + +static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + __m128i tmp0; + + /* Even stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + tmp_buf += 64; + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf, + 1792, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = __lsx_vadd_h(reg0, reg4); + reg0 = __lsx_vsub_h(reg0, reg4); + reg4 = __lsx_vadd_h(reg6, reg2); + reg6 = __lsx_vsub_h(reg6, reg2); + reg2 = __lsx_vadd_h(reg1, reg5); + reg1 = __lsx_vsub_h(reg1, reg5); + reg5 = __lsx_vadd_h(reg7, reg3); + reg7 = __lsx_vsub_h(reg7, reg3); + reg3 = vec0; + + vec1 = reg2; + reg2 = __lsx_vadd_h(reg3, reg4); + reg3 = __lsx_vsub_h(reg3, reg4); + reg4 = __lsx_vsub_h(reg5, vec1); + reg5 = __lsx_vadd_h(reg5, vec1); + + tmp0 = __lsx_vneg_h(reg6); + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = __lsx_vsub_h(reg0, reg6); + reg0 = __lsx_vadd_h(reg0, reg6); + vec1 = __lsx_vsub_h(reg7, reg1); + reg7 = __lsx_vadd_h(reg7, reg1); + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 0); + __lsx_vst(loc3, tmp_eve_buf, 16); + __lsx_vst(loc2, tmp_eve_buf, 224); + __lsx_vst(loc0, tmp_eve_buf, 240); + + LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 32); + __lsx_vst(loc3, tmp_eve_buf, 48); + __lsx_vst(loc2, tmp_eve_buf, 192); + __lsx_vst(loc0, tmp_eve_buf, 208); + + /* Store 8 */ + LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 64); + __lsx_vst(loc3, tmp_eve_buf, 80); + __lsx_vst(loc2, tmp_eve_buf, 160); + __lsx_vst(loc0, tmp_eve_buf, 176); + + LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + __lsx_vst(loc1, tmp_eve_buf, 96); + __lsx_vst(loc3, tmp_eve_buf, 112); + __lsx_vst(loc2, tmp_eve_buf, 128); + __lsx_vst(loc0, tmp_eve_buf, 144); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf, + 1984, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = __lsx_vadd_h(reg0, reg3); + reg0 = __lsx_vsub_h(reg0, reg3); + reg3 = __lsx_vadd_h(reg7, reg4); + reg7 = __lsx_vsub_h(reg7, reg4); + reg4 = __lsx_vadd_h(reg1, reg2); + reg1 = __lsx_vsub_h(reg1, reg2); + reg2 = __lsx_vadd_h(reg6, reg5); + reg6 = __lsx_vsub_h(reg6, reg5); + reg5 = vec0; + + /* 4 Stores */ + DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 64); + __lsx_vst(vec1, tmp_odd_buf, 80); + DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 0); + __lsx_vst(vec1, tmp_odd_buf, 16); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + __lsx_vst(vec0, tmp_odd_buf, 96); + __lsx_vst(vec1, tmp_odd_buf, 112); + __lsx_vst(vec2, tmp_odd_buf, 32); + __lsx_vst(vec3, tmp_odd_buf, 48); + + /* Odd stage 2 */ + /* 8 loads */ + DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf, + 1856, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, + vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + __lsx_vst(vec0, tmp_odd_buf, 192); + __lsx_vst(vec1, tmp_odd_buf, 240); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + __lsx_vst(vec0, tmp_odd_buf, 160); + __lsx_vst(vec1, tmp_odd_buf, 176); + + /* 4 Stores */ + DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, + vec1, vec2, vec3); + LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + __lsx_vst(reg0, tmp_odd_buf, 208); + __lsx_vst(reg1, tmp_odd_buf, 224); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + __lsx_vst(reg0, tmp_odd_buf, 128); + __lsx_vst(reg1, tmp_odd_buf, 144); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32, + tmp_odd_buf, 48, reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160, + tmp_odd_buf, 176, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 0); + __lsx_vst(loc1, tmp_odd_buf, 16); + __lsx_vst(loc2, tmp_odd_buf, 32); + __lsx_vst(loc3, tmp_odd_buf, 48); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 128); + __lsx_vst(loc1, tmp_odd_buf, 144); + __lsx_vst(loc2, tmp_odd_buf, 160); + __lsx_vst(loc3, tmp_odd_buf, 176); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96, + tmp_odd_buf, 112, reg1, reg2, reg0, reg3); + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224, + tmp_odd_buf, 240, reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, + loc1, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 64); + __lsx_vst(loc1, tmp_odd_buf, 80); + __lsx_vst(loc2, tmp_odd_buf, 96); + __lsx_vst(loc3, tmp_odd_buf, 112); + + DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + __lsx_vst(loc0, tmp_odd_buf, 192); + __lsx_vst(loc1, tmp_odd_buf, 208); + __lsx_vst(loc2, tmp_odd_buf, 224); + __lsx_vst(loc3, tmp_odd_buf, 240); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i n0, n1, n2, n3, n4, n5, n6, n7; + int32_t stride = dst_stride << 2; + int32_t stride2 = stride << 1; + int32_t stride3 = stride + stride2; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224, + tmp_odd_buf, 96, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64, + tmp_eve_buf, 192, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, + m4, m2, m6); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, + m2, m4, m0); + DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6); + VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2, + m4, m6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160, + tmp_odd_buf, 48, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96, + tmp_eve_buf, 224, loc0, loc1, loc2, loc3); + + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, + m5, m3, m7); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, + m3, m5, m1); + DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7); + VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3, + m5, m7); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192, + tmp_odd_buf, 112, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80, + tmp_eve_buf, 208, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, + n4, n2, n6); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4, + n6); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, + n2, n4, n0); + DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6); + VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2, + n4, n6); + + /* Load 8 & Store 8 */ + DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128, + tmp_odd_buf, 16, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112, + tmp_eve_buf, 240, loc0, loc1, loc2, loc3); + DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, + n5, n3, n7); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); + DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, + n3, n5, n1); + DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7); + VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3, + n5, n7); +} + +static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8))); + } + + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + __m128i zero = __lsx_vldi(0); + + for (i = 32; i--;) { + __lsx_vst(zero, out_ptr, 0); + __lsx_vst(zero, out_ptr, 16); + __lsx_vst(zero, out_ptr, 32); + __lsx_vst(zero, out_ptr, 48); + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_lsx(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __lsx_vreplgr2vr_h(out); + + for (i = 16; i--;) { + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + dst2 = __lsx_vldx(dst, dst_stride); + dst3 = __lsx_vldx(dst + 16, dst_stride); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + + DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0, + res1, res2, res3); + DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4, + res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0, + res7, res3, 0, tmp0, tmp1, tmp2, tmp3); + __lsx_vst(tmp0, dst, 0); + __lsx_vst(tmp1, dst, 16); + dst += dst_stride; + __lsx_vst(tmp2, dst, 0); + __lsx_vst(tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c new file mode 100644 index 0000000000..f990211791 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/intrapred_lsx.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i store, sum_h, sum_w, sum_d; + __m128i src = { 0 }; + + val0 = *(const uint64_t *)src_top; + val1 = *(const uint64_t *)src_left; + DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src); + sum_h = __lsx_vhaddw_hu_bu(src, src); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 4); + store = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); + dst += dst_stride_x4; + __lsx_vstelm_d(store, dst, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0); + __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0); +} + +static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, + int32_t dst_stride) { + int32_t dst_stride_x2 = dst_stride << 1; + int32_t dst_stride_x3 = dst_stride_x2 + dst_stride; + int32_t dst_stride_x4 = dst_stride << 2; + __m128i top, left, out; + __m128i sum_h, sum_top, sum_left; + __m128i sum_w; + __m128i sum_d; + + DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left); + DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left); + sum_h = __lsx_vadd_h(sum_top, sum_left); + sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vpickev_w(sum_d, sum_d); + sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w); + sum_w = __lsx_vsrari_w(sum_d, 5); + out = __lsx_vreplvei_b(sum_w, 0); + + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); + dst += dst_stride_x4; + __lsx_vstx(out, dst, 0); + __lsx_vstx(out, dst, dst_stride); + __lsx_vstx(out, dst, dst_stride_x2); + __lsx_vstx(out, dst, dst_stride_x3); +} + +void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_lsx(above, left, dst, y_stride); +} + +void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_lsx(above, left, dst, y_stride); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c new file mode 100644 index 0000000000..0503df9966 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c @@ -0,0 +1,1320 @@ +/* + * Copyright (c) 2022 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" +#include "vpx_ports/mem.h" + +#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \ + _in2, _in3, _in4, _in5, _in6, _in7) \ + do { \ + _in0 = __lsx_vld(_src, 0); \ + _in1 = __lsx_vldx(_src, _stride); \ + _in2 = __lsx_vldx(_src, _stride2); \ + _in3 = __lsx_vldx(_src, _stride3); \ + _src += _stride4; \ + _in4 = __lsx_vld(_src, 0); \ + _in5 = __lsx_vldx(_src, _stride); \ + _in6 = __lsx_vldx(_src, _stride2); \ + _in7 = __lsx_vldx(_src, _stride3); \ + } while (0) + +#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \ + _stride, _stride2, _stride3, _stride4) \ + do { \ + __lsx_vst(_dst0, _dst, 0); \ + __lsx_vstx(_dst1, _dst, _stride); \ + __lsx_vstx(_dst2, _dst, _stride2); \ + __lsx_vstx(_dst3, _dst, _stride3); \ + _dst += _stride4; \ + __lsx_vst(_dst4, _dst, 0); \ + __lsx_vstx(_dst5, _dst, _stride); \ + __lsx_vstx(_dst6, _dst, _stride2); \ + __lsx_vstx(_dst7, _dst, _stride3); \ + } while (0) + +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vstx(p1_out, dst, -stride2); + __lsx_vstx(p0_out, dst, -stride); + __lsx_vst(q0_out, dst, 0); + __lsx_vstx(q1_out, dst, stride); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat, + p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out, + p0_out, q0_out); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat, + q1_out, q2_out); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_h, out_l; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + + p3 = __lsx_vld(dst_tmp0, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp0, stride3); + + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + __lsx_vstx(p2, dst, -stride3); + __lsx_vstx(p1, dst, -stride2); + __lsx_vstx(p0, dst, -stride); + __lsx_vst(q0, dst, 0); + __lsx_vstx(q1, dst, stride); + __lsx_vstx(q2, dst, stride2); + } else { + dst = dst_tmp0 - stride3; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + dst += stride; + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 0); + dst += stride; + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 0); + dst += stride; + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 0); + dst += stride; + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* p0 */ + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 0); + dst += stride; + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 0); + dst += stride; + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 0); + dst += stride; + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 0); + dst += stride; + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 0); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (early_exit == 0) { + hz_lpf_t16_16w(dst, stride, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (count == 1) { + __m128i flat2, mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i p0_filter16, p1_filter16; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + __m128i zero = __lsx_vldi(0); + __m128i tmp0, tmp1, tmp2; + + int32_t stride2 = stride << 1; + int32_t stride3 = 2 + stride; + int32_t stride4 = stride << 2; + uint8_t *dst_tmp0 = dst - stride4; + uint8_t *dst_tmp1 = dst + stride4; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* filter_mask* */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + flat = __lsx_vilvl_d(zero, flat); + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + /* convert 8 bit input data into 16 bit */ + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, + p2_l, p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, + q1_l, q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero, + p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8, + p0_filter8, q0_filter8); + DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8, + q2_filter8); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat); + + /* load 16 vector elements */ + DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0, + -stride2, dst_tmp0, -stride, p7, p6, p5, p4); + q4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6); + q7 = __lsx_vldx(dst_tmp1, stride3); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__lsx_bz_v(flat2)) { + dst -= stride3; + __lsx_vstelm_d(p2_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q1_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(q2_out, dst, 0, 0); + } else { + /* LSB(right) 8 pixel operation */ + DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l, + p6_l, p5_l, p4_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l, + q5_l, q6_l, q7_l); + + tmp0 = __lsx_vslli_h(p7_l, 3); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp0 = __lsx_vadd_h(tmp0, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q0_l); + + dst = dst_tmp0 - stride3; + + /* calculation of p6 and p5 */ + tmp1 = __lsx_vadd_h(p6_l, p5_l); + tmp1 = __lsx_vadd_h(tmp1, p4_l); + tmp1 = __lsx_vadd_h(tmp1, p3_l); + tmp1 = __lsx_vadd_h(tmp1, p2_l); + tmp1 = __lsx_vadd_h(tmp1, p1_l); + tmp1 = __lsx_vadd_h(tmp1, p0_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp0 = __lsx_vsub_h(p5_l, p6_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p4 and p3 */ + tmp0 = __lsx_vsub_h(p4_l, p5_l); + tmp0 = __lsx_vadd_h(tmp0, q2_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p3_l, p4_l); + tmp2 = __lsx_vadd_h(tmp2, q3_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p2 and p1 */ + tmp0 = __lsx_vsub_h(p2_l, p3_l); + tmp0 = __lsx_vadd_h(tmp0, q4_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(p1_l, p2_l); + tmp2 = __lsx_vadd_h(tmp2, q5_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of p0 and q0 */ + tmp0 = __lsx_vsub_h(p0_l, p1_l); + tmp0 = __lsx_vadd_h(tmp0, q6_l); + tmp0 = __lsx_vsub_h(tmp0, p7_l); + tmp2 = __lsx_vsub_h(q7_l, p0_l); + tmp2 = __lsx_vadd_h(tmp2, q0_l); + tmp2 = __lsx_vsub_h(tmp2, p7_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q1 and q2 */ + tmp0 = __lsx_vsub_h(q7_l, q0_l); + tmp0 = __lsx_vadd_h(tmp0, q1_l); + tmp0 = __lsx_vsub_h(tmp0, p6_l); + tmp2 = __lsx_vsub_h(q7_l, q1_l); + tmp2 = __lsx_vadd_h(tmp2, q2_l); + tmp2 = __lsx_vsub_h(tmp2, p5_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out, + p1_filter16, flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q3 and q4 */ + tmp0 = __lsx_vsub_h(q7_l, q2_l); + tmp0 = __lsx_vadd_h(tmp0, q3_l); + tmp0 = __lsx_vsub_h(tmp0, p4_l); + tmp2 = __lsx_vsub_h(q7_l, q3_l); + tmp2 = __lsx_vadd_h(tmp2, q4_l); + tmp2 = __lsx_vsub_h(tmp2, p3_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + dst += stride; + + /* calculation of q5 and q6 */ + tmp0 = __lsx_vsub_h(q7_l, q4_l); + tmp0 = __lsx_vadd_h(tmp0, q5_l); + tmp0 = __lsx_vsub_h(tmp0, p2_l); + tmp2 = __lsx_vsub_h(q7_l, q5_l); + tmp2 = __lsx_vadd_h(tmp2, q6_l); + tmp2 = __lsx_vsub_h(tmp2, p1_l); + tmp1 = __lsx_vadd_h(tmp1, tmp0); + p0_filter16 = __lsx_vsrari_h(tmp1, 4); + tmp1 = __lsx_vadd_h(tmp1, tmp2); + p1_filter16 = __lsx_vsrari_h(tmp1, 4); + DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16, + p0_filter16, p1_filter16); + DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16, + flat2, p0_filter16, p1_filter16); + __lsx_vstelm_d(p0_filter16, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p1_filter16, dst, 0, 0); + } + } + } else { + mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr, + thresh_ptr); + } +} + +void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output, + int32_t out_stride) { + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + __m128i tmp2, tmp3; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + int32_t in_stride2 = in_stride << 1; + int32_t in_stride3 = in_stride2 + in_stride; + int32_t in_stride4 = in_stride2 << 1; + int32_t out_stride2 = out_stride << 1; + int32_t out_stride3 = out_stride2 + out_stride; + int32_t out_stride4 = out_stride2 << 1; + + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1, + row2, row3, row4, row5, row6, row7); + input += in_stride4; + LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9, + row10, row11, row12, row13, row14, row15); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = __lsx_vpackod_d(row8, row0); + q6 = __lsx_vpackod_d(row9, row1); + q5 = __lsx_vpackod_d(row10, row2); + q4 = __lsx_vpackod_d(row11, row3); + q3 = __lsx_vpackod_d(row12, row4); + q2 = __lsx_vpackod_d(row13, row5); + q1 = __lsx_vpackod_d(row14, row6); + q0 = __lsx_vpackod_d(row15, row7); + + DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1); + DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5); + + DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7); + DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7); + + DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3); + q0 = __lsx_vpackev_w(tmp3, tmp2); + q4 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp1, tmp0); + tmp3 = __lsx_vpackod_h(q7, q5); + q2 = __lsx_vpackev_w(tmp3, tmp2); + q6 = __lsx_vpackod_w(tmp3, tmp2); + + DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3); + q1 = __lsx_vpackev_w(tmp3, tmp2); + q5 = __lsx_vpackod_w(tmp3, tmp2); + + tmp2 = __lsx_vpackod_h(tmp5, tmp4); + tmp3 = __lsx_vpackod_h(tmp7, tmp6); + q3 = __lsx_vpackev_w(tmp3, tmp2); + q7 = __lsx_vpackod_w(tmp3, tmp2); + + LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2, + out_stride3, out_stride4); + output += out_stride4; + LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2, + out_stride3, out_stride4); +} + +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48, + uint8_t *dst_org, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1); + vec4 = __lsx_vilvl_h(vec1, vec0); + vec5 = __lsx_vilvh_h(vec1, vec0); + + dst_org -= 2; + __lsx_vstelm_w(vec2, dst_org, 0, 0); + __lsx_vstelm_w(vec2, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_w(vec3, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_w(vec4, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3); + dst_org += stride4; + __lsx_vstelm_w(vec5, dst_org, 0, 0); + __lsx_vstelm_w(vec5, dst_org + stride, 0, 1); + __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2); + __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3); + + return 1; + } + + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, filter48, 0); + __lsx_vst(p1_out, filter48, 16); + __lsx_vst(p0_out, filter48, 32); + __lsx_vst(q0_out, filter48, 48); + __lsx_vst(q1_out, filter48, 64); + __lsx_vst(q2_out, filter48, 80); + __lsx_vst(flat, filter48, 96); + + return 0; +} + +static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride, + uint8_t *filter48) { + __m128i flat, flat2, filter8; + __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + __m128i out_l, out_h; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in; + v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in; + v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in; + v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in; + v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in; + v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in; + v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h; + uint8_t *dst_tmp = dst - 128; + + flat = __lsx_vld(filter48, 96); + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7, + p6, p5, p4); + DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3, + p2, p1, p0); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3); + DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7); + + VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + /* if flat2 is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat2)) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48, + p2, p1, p0, q0); + DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1); + vec3 = __lsx_vilvl_h(vec1, vec0); + vec4 = __lsx_vilvh_h(vec1, vec0); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1); + vec6 = __lsx_vilvl_h(vec1, vec0); + vec7 = __lsx_vilvh_h(vec1, vec0); + vec2 = __lsx_vilvl_b(q2, q1); + vec5 = __lsx_vilvh_b(q2, q1); + + dst_org -= 3; + __lsx_vstelm_w(vec3, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec3, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 0); + __lsx_vstelm_h(vec2, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 1); + __lsx_vstelm_h(vec2, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 2); + __lsx_vstelm_h(vec2, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec4, dst_org, 0, 3); + __lsx_vstelm_h(vec2, dst_org, 4, 7); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 0); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 1); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 2); + dst_org += stride; + __lsx_vstelm_w(vec6, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 3); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 0); + __lsx_vstelm_h(vec5, dst_org, 4, 4); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 1); + __lsx_vstelm_h(vec5, dst_org, 4, 5); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 2); + __lsx_vstelm_h(vec5, dst_org, 4, 6); + dst_org += stride; + __lsx_vstelm_w(vec7, dst_org, 0, 3); + __lsx_vstelm_h(vec5, dst_org, 4, 7); + + return 1; + } + + dst -= 7 * 16; + + p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0); + p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0); + p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0); + p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0); + p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0); + p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0); + p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0); + p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0); + q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7); + p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6); + p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5); + p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4); + p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3); + p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2); + p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1); + p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0); + q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0); + + tmp0_h = p7_h_in << 3; + tmp0_h -= p7_h_in; + tmp0_h += p6_h_in; + tmp0_h += q0_h_in; + tmp1_h = p6_h_in + p5_h_in; + tmp1_h += p4_h_in; + tmp1_h += p3_h_in; + tmp1_h += p2_h_in; + tmp1_h += p1_h_in; + tmp1_h += p0_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + + out_l = __lsx_vpickev_b(out_h, out_l); + p6 = __lsx_vbitsel_v(p6, out_l, flat2); + __lsx_vst(p6, dst, 0); + + /* p5 */ + q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1); + tmp0_h = p5_h_in - p6_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p5 = __lsx_vbitsel_v(p5, out_l, flat2); + __lsx_vst(p5, dst, 16); + + /* p4 */ + q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2); + tmp0_h = p4_h_in - p5_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p4 = __lsx_vbitsel_v(p4, out_l, flat2); + __lsx_vst(p4, dst, 16 * 2); + + /* p3 */ + q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3); + tmp0_h = p3_h_in - p4_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + p3 = __lsx_vbitsel_v(p3, out_l, flat2); + __lsx_vst(p3, dst, 16 * 3); + + /* p2 */ + q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0); + filter8 = __lsx_vld(filter48, 0); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4); + tmp0_h = p2_h_in - p3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 4); + + /* p1 */ + q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0); + filter8 = __lsx_vld(filter48, 16); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5); + tmp0_h = p1_h_in - p2_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)(tmp1_h), 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 5); + + /* p0 */ + q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0); + filter8 = __lsx_vld(filter48, 32); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6); + tmp0_h = p0_h_in - p1_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 6); + + /* q0 */ + q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0); + filter8 = __lsx_vld(filter48, 48); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7); + tmp0_h = q7_h_in - p0_h_in; + tmp0_h += q0_h_in; + tmp0_h -= p7_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 7); + + /* q1 */ + filter8 = __lsx_vld(filter48, 64); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q0_h_in; + tmp0_h += q1_h_in; + tmp0_h -= p6_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 8); + + /* q2 */ + filter8 = __lsx_vld(filter48, 80); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q1_h_in; + tmp0_h += q2_h_in; + tmp0_h -= p5_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + filter8 = __lsx_vbitsel_v(filter8, out_l, flat2); + __lsx_vst(filter8, dst, 16 * 9); + + /* q3 */ + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q2_h_in; + tmp0_h += q3_h_in; + tmp0_h -= p4_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q3 = __lsx_vbitsel_v(q3, out_l, flat2); + __lsx_vst(q3, dst, 16 * 10); + + /* q4 */ + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q3_h_in; + tmp0_h += q4_h_in; + tmp0_h -= p3_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q4 = __lsx_vbitsel_v(q4, out_l, flat2); + __lsx_vst(q4, dst, 16 * 11); + + /* q5 */ + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q4_h_in; + tmp0_h += q5_h_in; + tmp0_h -= p2_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q5 = __lsx_vbitsel_v(q5, out_l, flat2); + __lsx_vst(q5, dst, 16 * 12); + + /* q6 */ + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4); + tmp0_h = q7_h_in - q5_h_in; + tmp0_h += q6_h_in; + tmp0_h -= p1_h_in; + tmp1_h += tmp0_h; + out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4); + out_l = __lsx_vpickev_b(out_h, out_l); + q6 = __lsx_vbitsel_v(q6, out_l, flat2); + __lsx_vst(q6, dst, 16 * 13); + + return 0; +} + +void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (early_exit == 0) { + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); + + if (early_exit == 0) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c new file mode 100644 index 0000000000..9300b5c5ae --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + __lsx_vstelm_d(p1_out, src - pitch2, 0, 0); + __lsx_vstelm_d(p0_out, src - pitch, 0, 0); + __lsx_vstelm_d(q0_out, src, 0, 0); + __lsx_vstelm_d(q1_out, src + pitch, 0, 0); +} + +void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + + DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch, + p3, p2, p1, p0); + q0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2); + q3 = __lsx_vldx(src, pitch3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + __lsx_vstx(p1, src, -pitch2); + __lsx_vstx(p0, src, -pitch); + __lsx_vst(q0, src, 0); + __lsx_vstx(q1, src, pitch); +} + +void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, limit, thresh, b_limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i vec0, vec1, vec2, vec3; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + p3 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1); + p0 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + q0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2); + q3 = __lsx_vldx(src_tmp, pitch3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1); + vec2 = __lsx_vilvl_h(vec1, vec0); + vec3 = __lsx_vilvh_h(vec1, vec0); + + src -= 2; + __lsx_vstelm_w(vec2, src, 0, 0); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 1); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 2); + src += pitch; + __lsx_vstelm_w(vec2, src, 0, 3); + src += pitch; + + __lsx_vstelm_w(vec3, src, 0, 0); + __lsx_vstelm_w(vec3, src + pitch, 0, 1); + __lsx_vstelm_w(vec3, src + pitch2, 0, 2); + __lsx_vstelm_w(vec3, src + pitch3, 0, 3); +} + +void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + __m128i mask, hev, flat; + __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i row8, row9, row10, row11, row12, row13, row14, row15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + int32_t pitch2 = pitch << 1; + int32_t pitch3 = pitch2 + pitch; + int32_t pitch4 = pitch2 << 1; + uint8_t *src_tmp = src - 4; + + row0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2); + row3 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6); + row7 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row8 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10); + row11 = __lsx_vldx(src_tmp, pitch3); + src_tmp += pitch4; + row12 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14); + row15 = __lsx_vldx(src_tmp, pitch3); + + LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0); + thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0); + thresh0 = __lsx_vilvl_d(thresh1, thresh0); + + b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0); + b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0); + b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0); + + limit0 = __lsx_vldrepl_b(limit0_ptr, 0); + limit1 = __lsx_vldrepl_b(limit1_ptr, 0); + limit0 = __lsx_vilvl_d(limit1, limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1); + tmp2 = __lsx_vilvl_h(tmp1, tmp0); + tmp3 = __lsx_vilvh_h(tmp1, tmp0); + DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1); + tmp4 = __lsx_vilvl_h(tmp1, tmp0); + tmp5 = __lsx_vilvh_h(tmp1, tmp0); + + src -= 2; + __lsx_vstelm_w(tmp2, src, 0, 0); + __lsx_vstelm_w(tmp2, src + pitch, 0, 1); + __lsx_vstelm_w(tmp2, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp2, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp3, src, 0, 0); + __lsx_vstelm_w(tmp3, src + pitch, 0, 1); + __lsx_vstelm_w(tmp3, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp3, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp4, src, 0, 0); + __lsx_vstelm_w(tmp4, src + pitch, 0, 1); + __lsx_vstelm_w(tmp4, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp4, src + pitch3, 0, 3); + src += pitch4; + __lsx_vstelm_w(tmp5, src, 0, 0); + __lsx_vstelm_w(tmp5, src + pitch, 0, 1); + __lsx_vstelm_w(tmp5, src + pitch2, 0, 2); + __lsx_vstelm_w(tmp5, src + pitch3, 0, 3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c new file mode 100644 index 0000000000..00219ba71d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/loopfilter_lsx.h" + +void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i mask, hev, flat, thresh, b_limit, limit; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out; + __m128i p2_filter8, p1_filter8, p0_filter8; + __m128i q0_filter8, q1_filter8, q2_filter8; + __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + /* load vector elements */ + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(flat, flat); + + if (__lsx_bz_v(flat)) { + __lsx_vstelm_d(p1_out, dst - stride2, 0, 0); + __lsx_vstelm_d(p0_out, dst - stride, 0, 0); + __lsx_vstelm_d(q0_out, dst, 0, 0); + __lsx_vstelm_d(q1_out, dst + stride, 0, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8, + p1_filter8, q0_filter8); + q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8); + + p2 = __lsx_vilvl_d(p1_out, p2); + p0_out = __lsx_vilvl_d(q0_out, p0_out); + q1_out = __lsx_vilvl_d(q2, q1_out); + + DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat, + p2_out, p1_out); + p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat); + dst -= stride3; + + __lsx_vstelm_d(p2_out, dst, 0, 0); + __lsx_vstelm_d(p2_out, dst + stride, 0, 1); + __lsx_vstelm_d(p1_out, dst + stride2, 0, 0); + __lsx_vstelm_d(p1_out, dst + stride3, 0, 1); + + dst += stride4; + __lsx_vstelm_d(p0_out, dst, 0, 0); + dst += stride; + __lsx_vstelm_d(p0_out, dst, 0, 1); + } +} + +void vpx_lpf_horizontal_8_dual_lsx( + uint8_t *dst, int32_t stride, const uint8_t *b_limit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1, + const uint8_t *limit1, const uint8_t *thresh1) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst, + -stride, p3, p2, p1, p0); + q0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2); + q3 = __lsx_vldx(dst, stride3); + + thresh = __lsx_vldrepl_b(thresh0, 0); + p2_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p2_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p2_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p2_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p2_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p2_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__lsx_bz_v(flat)) { + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + __lsx_vst(p2_out, dst - stride3, 0); + __lsx_vst(p1_out, dst - stride2, 0); + __lsx_vst(p0_out, dst - stride, 0); + __lsx_vst(q0_out, dst, 0); + __lsx_vst(q1_out, dst + stride, 0); + __lsx_vst(q2_out, dst + stride2, 0); + } +} + +void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i zero = __lsx_vldi(0); + + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + uint8_t *dst_tmp = dst - 4; + + /* load vector elements */ + p3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1); + p0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + q0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2); + q3 = __lsx_vldx(dst_tmp, stride3); + + LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh_ptr, 0); + b_limit = __lsx_vldrepl_b(b_limit_ptr, 0); + limit = __lsx_vldrepl_b(limit_ptr, 0); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = __lsx_vilvl_d(zero, flat); + + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, + p1_l, p0_l); + DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, + q2_l, q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l, + p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + /* Store 6 pixels p2-_q2 */ + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p1 = __lsx_vilvl_h(q3, p3); + p2 = __lsx_vilvh_h(q3, p3); + p3 = __lsx_vilvl_b(q2, q1); + dst -= 3; + __lsx_vstelm_w(p1, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p1, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_h(p3, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 1); + __lsx_vstelm_h(p3, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 2); + __lsx_vstelm_h(p3, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2, dst, 0, 3); + __lsx_vstelm_h(p3, dst, 4, 7); + } +} + +void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *dst_tmp = dst - 4; + __m128i p3, p2, p1, p0, q3, q2, q1, q0; + __m128i p1_out, p0_out, q0_out, q1_out; + __m128i flat, mask, hev, thresh, b_limit, limit; + __m128i row4, row5, row6, row7, row12, row13, row14, row15; + __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h; + __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l; + __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l; + __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h; + __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h; + int32_t stride2 = stride << 1; + int32_t stride3 = stride2 + stride; + int32_t stride4 = stride2 << 1; + + p0 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2); + p3 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row4 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6); + row7 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + + q3 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1); + q0 = __lsx_vldx(dst_tmp, stride3); + dst_tmp += stride4; + row12 = __lsx_vld(dst_tmp, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14); + row15 = __lsx_vldx(dst_tmp, stride3); + + /* transpose 16x8 matrix into 8x16 */ + LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = __lsx_vldrepl_b(thresh0, 0); + p1_out = __lsx_vldrepl_b(thresh1, 0); + thresh = __lsx_vilvl_d(p1_out, thresh); + + b_limit = __lsx_vldrepl_b(b_limit0, 0); + p1_out = __lsx_vldrepl_b(b_limit1, 0); + b_limit = __lsx_vilvl_d(p1_out, b_limit); + + limit = __lsx_vldrepl_b(limit0, 0); + p1_out = __lsx_vldrepl_b(limit1, 0); + limit = __lsx_vilvl_d(p1_out, limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + /* if flat is zero for all pixels, then no need to calculate other filter */ + if (__lsx_bz_v(flat)) { + DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + p2 = __lsx_vilvl_h(p1, p0); + p3 = __lsx_vilvh_h(p1, p0); + DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1); + q2 = __lsx_vilvl_h(p1, p0); + q3 = __lsx_vilvh_h(p1, p0); + dst -= 2; + __lsx_vstelm_w(p2, dst, 0, 0); + __lsx_vstelm_w(p2, dst + stride, 0, 1); + __lsx_vstelm_w(p2, dst + stride2, 0, 2); + __lsx_vstelm_w(p2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(p3, dst, 0, 0); + __lsx_vstelm_w(p3, dst + stride, 0, 1); + __lsx_vstelm_w(p3, dst + stride2, 0, 2); + __lsx_vstelm_w(p3, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q2, dst, 0, 0); + __lsx_vstelm_w(q2, dst + stride, 0, 1); + __lsx_vstelm_w(q2, dst + stride2, 0, 2); + __lsx_vstelm_w(q2, dst + stride3, 0, 3); + dst += stride4; + __lsx_vstelm_w(q3, dst, 0, 0); + __lsx_vstelm_w(q3, dst + stride, 0, 1); + __lsx_vstelm_w(q3, dst + stride2, 0, 2); + __lsx_vstelm_w(q3, dst + stride3, 0, 3); + } else { + DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l, + p0_l); + DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l, + q3_l); + VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h); + DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h); + + /* filter8 */ + VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h, + p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h); + + /* convert 16 bit output data into 8 bit */ + DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l, + p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l); + DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l, + q1_filt8_l, q2_filt8_l); + + /* store pixel values */ + p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat); + p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat); + p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat); + q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat); + q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat); + q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat); + + DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3); + p2_filt8_l = __lsx_vilvl_h(q3, p3); + p2_filt8_h = __lsx_vilvh_h(q3, p3); + DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3); + p0_filt8_l = __lsx_vilvl_h(q3, p3); + p0_filt8_h = __lsx_vilvh_h(q3, p3); + q1_filt8_l = __lsx_vilvl_b(q2, q1); + q1_filt8_h = __lsx_vilvh_b(q2, q1); + + dst -= 3; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p2_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p2_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_l, dst, 4, 7); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 0); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 1); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 2); + dst += stride; + __lsx_vstelm_w(p0_filt8_l, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 3); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 0); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 4); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 1); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 5); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 2); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 6); + dst += stride; + __lsx_vstelm_w(p0_filt8_h, dst, 0, 3); + __lsx_vstelm_h(q1_filt8_h, dst, 4, 7); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h new file mode 100644 index 0000000000..1c43836503 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + do { \ + __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in); \ + p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in); \ + p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in); \ + q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in); \ + q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in); \ + q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in); \ + p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in); \ + p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = __lsx_vslt_bu(thresh_in, flat_out); \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1); \ + p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m); \ + mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m); \ + mask_out = __lsx_vmax_bu(flat_out, mask_out); \ + p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out); \ + \ + mask_out = __lsx_vslt_bu(limit_in, mask_out); \ + mask_out = __lsx_vxori_b(mask_out, 0xff); \ + } while (0) + +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + do { \ + __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0; \ + __m128i flat4_tmp = __lsx_vldi(1); \ + \ + DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \ + q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0); \ + p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0); \ + flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out); \ + p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0); \ + flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out); \ + \ + flat_out = __lsx_vslt_bu(flat4_tmp, flat_out); \ + flat_out = __lsx_vxori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } while (0) + +#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + do { \ + __m128i flat5_tmp = __lsx_vldi(1); \ + __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0; \ + __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0; \ + DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \ + q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0); \ + DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \ + q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0); \ + \ + DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0, \ + p4_asub_p0, flat2_out); \ + flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out); \ + p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0); \ + flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out); \ + p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0); \ + flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out); \ + flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out); \ + flat2_out = __lsx_vxori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } while (0) + +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + do { \ + __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const __m128i cnst4b = __lsx_vldi(4); \ + const __m128i cnst3b = __lsx_vldi(3); \ + DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \ + 0x80, p1_m, p0_m, q0_m, q1_m); \ + filt = __lsx_vssub_b(p1_m, q1_m); \ + filt &= hev; \ + \ + q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt = __lsx_vsadd_b(filt, q0_sub_p0); \ + filt &= mask; \ + DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2); \ + DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2); \ + \ + q0_m = __lsx_vssub_b(q0_m, t1); \ + p0_m = __lsx_vsadd_b(p0_m, t2); \ + DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out); \ + \ + filt = __lsx_vsrari_b(t1, 1); \ + hev = __lsx_vxori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __lsx_vssub_b(q1_m, filt); \ + p1_m = __lsx_vsadd_b(p1_m, filt); \ + DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out); \ + } while (0) + +#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + do { \ + __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vslli_h(p3_in, 1); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in); \ + p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in); \ + p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in); \ + tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in); \ + p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in); \ + tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0); \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0); \ + q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in); \ + q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in); \ + tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in); \ + tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1); \ + q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c new file mode 100644 index 0000000000..1299e75e9a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/quantize_lsx.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, + __m128i round, __m128i quant, + __m128i shift, __m128i cmp_mask) { + __m128i rounded, qcoeff; + + rounded = __lsx_vsadd_h(coeff_abs, round); + qcoeff = __lsx_vmuh_h(rounded, quant); + qcoeff = __lsx_vadd_h(rounded, qcoeff); + qcoeff = __lsx_vmuh_h(qcoeff, shift); + qcoeff = __lsx_vsigncov_h(coeff, qcoeff); + qcoeff = __lsx_vand_v(qcoeff, cmp_mask); + + return qcoeff; +} + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + int16_t *dqcoeff) { + __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant); + __lsx_vst(dqcoeff16, dqcoeff, 0); +} + +static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, + __m128i dequant, + int16_t *dqcoeff) { + // Un-sign to bias rounding like C. + __m128i low, high, dqcoeff32_0, dqcoeff32_1, res; + __m128i zero = __lsx_vldi(0); + __m128i coeff = __lsx_vabsd_h(qcoeff, zero); + + const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero); + const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero); + + low = __lsx_vmul_h(coeff, dequant); + high = __lsx_vmuh_h(coeff, dequant); + dqcoeff32_0 = __lsx_vilvl_h(high, low); + dqcoeff32_1 = __lsx_vilvh_h(high, low); + + // "Divide" by 2. + dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1); + dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1); + dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0); + dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1); + res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0); + __lsx_vst(res, dqcoeff, 0); +} + +static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, + const int16_t *scan, int index, + __m128i zero) { + const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); + const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero); + __m128i scan0 = __lsx_vld(scan + index, 0); + __m128i scan1 = __lsx_vld(scan + index + 8, 0); + __m128i eob0, eob1; + + eob0 = __lsx_vandn_v(zero_coeff0, scan0); + eob1 = __lsx_vandn_v(zero_coeff1, scan1); + return __lsx_vmax_h(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + int16_t res_m; + + eob_shuffled = __lsx_vshuf4i_w(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0xe); + eob = __lsx_vmax_h(eob, eob_shuffled); + eob_shuffled = __lsx_vshuf4i_h(eob, 0x1); + eob = __lsx_vmax_h(eob, eob_shuffled); + res_m = __lsx_vpickve2gr_h(eob, 1); + + return res_m; +} + +#if !CONFIG_VP9_HIGHBITDEPTH + +void vpx_quantize_b_lsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m128i zero = __lsx_vldi(0); + int index = 16; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); + // AC only loop. + while (index < n_coeffs) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); + eob = __lsx_vmax_h(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m128i zero = __lsx_vldi(0); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, quant_shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0; + + zbin = __lsx_vld(mb_plane->zbin, 0); + zbin = __lsx_vsrari_h(zbin, 1); + round = __lsx_vld(mb_plane->round, 0); + round = __lsx_vsrari_h(round, 1); + + quant = __lsx_vld(mb_plane->quant, 0); + dequant = __lsx_vld(dequant_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); + quant_shift = __lsx_vslli_h(quant_shift, 1); + // Handle one DC and first 15 AC. + DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + // remove DC from zbin + zbin = __lsx_vilvh_d(zbin, zbin); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + // remove DC in quant_shift, quant, quant_shift + round = __lsx_vilvh_d(round, round); + quant = __lsx_vilvh_d(quant, quant); + quant_shift = __lsx_vilvh_d(quant_shift, quant_shift); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr, 0); + __lsx_vst(qcoeff1, qcoeff_ptr, 16); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); + dequant = __lsx_vilvh_d(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = __lsx_vld(coeff_ptr + index, 0); + coeff1 = __lsx_vld(coeff_ptr + index + 8, 0); + + qcoeff0 = __lsx_vabsd_h(coeff0, zero); + qcoeff1 = __lsx_vabsd_h(coeff1, zero); + + cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0); + cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1); + + qcoeff0 = + calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0); + qcoeff1 = + calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1); + __lsx_vst(qcoeff0, qcoeff_ptr + index, 0); + __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, + dqcoeff_ptr + 8 + index); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); + eob = __lsx_vmax_h(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c new file mode 100644 index 0000000000..b6fbedb0d0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sad_lsx.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0, + __m128i ref1) { + __m128i diff0_m, diff1_m, sad_m0; + __m128i sad_m = __lsx_vldi(0); + + diff0_m = __lsx_vabsd_bu(in0, ref0); + diff1_m = __lsx_vabsd_bu(in1, ref1); + + sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m); + sad_m = __lsx_vadd_h(sad_m, sad_m0); + + return sad_m; +} + +static INLINE uint32_t hadd_uw_u32(__m128i in) { + __m128i res0_m; + uint32_t sum_m; + + res0_m = __lsx_vhaddw_du_wu(in, in); + res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static INLINE uint32_t hadd_uh_u32(__m128i in) { + __m128i res_m; + uint32_t sum_m; + + res_m = __lsx_vhaddw_wu_hu(in, in); + sum_m = hadd_uw_u32(res_m); + + return sum_m; +} + +static INLINE int32_t hadd_sw_s32(__m128i in) { + __m128i res0_m; + int32_t sum_m; + + res0_m = __lsx_vhaddw_d_w(in, in); + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); + sum_m = __lsx_vpickve2gr_w(res0_m, 0); + + return sum_m; +} + +static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (ht_cnt = (height >> 2); ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2); + src += src_stride; + ref += ref_stride; + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3); + src += src_stride; + ref += ref_stride; + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad = __lsx_vldi(0); + int32_t src_stride2 = src_stride << 1; + int32_t ref_stride2 = ref_stride << 1; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0); + DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1); + src += src_stride2; + ref += ref_stride2; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 2); + uint32_t res; + __m128i src0, src1, ref0, ref1; + __m128i sad_tmp; + __m128i sad = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt = (height >> 1); + uint32_t sad = 0; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + + sad = hadd_uh_u32(sad0); + sad += hadd_uh_u32(sad1); + + return sad; +} + +static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 2); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src0, src1, src2, src3, sad_tmp; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src0 = __lsx_vld(src_ptr, 0); + DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_ptr, src_stride3); + src_ptr += src_stride4; + ref0 = __lsx_vld(ref0_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1, + ref2); + ref3 = __lsx_vldx(ref0_ptr, ref_stride3); + ref0_ptr += ref_stride4; + ref4 = __lsx_vld(ref1_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5, + ref6); + ref7 = __lsx_vldx(ref1_ptr, ref_stride3); + ref1_ptr += ref_stride4; + ref8 = __lsx_vld(ref2_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9, + ref10); + ref11 = __lsx_vldx(ref2_ptr, ref_stride3); + ref2_ptr += ref_stride4; + ref12 = __lsx_vld(ref3_ptr, 0); + DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13, + ref14); + ref15 = __lsx_vldx(ref3_ptr, ref_stride3); + ref3_ptr += ref_stride4; + + DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1); + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1); + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt = (height >> 1); + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref0 = __lsx_vld(ref0_ptr, 0); + ref0_ptr += ref_stride; + ref1 = __lsx_vld(ref1_ptr, 0); + ref1_ptr += ref_stride; + ref2 = __lsx_vld(ref2_ptr, 0); + ref2_ptr += ref_stride; + ref3 = __lsx_vld(ref3_ptr, 0); + ref3_ptr += ref_stride; + + diff = __lsx_vabsd_bu(src, ref0); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + diff = __lsx_vabsd_bu(src, ref1); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + diff = __lsx_vabsd_bu(src, ref2); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + diff = __lsx_vabsd_bu(src, ref3); + sad_tmp = __lsx_vhaddw_hu_bu(diff, diff); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, ref0, ref1, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + __m128i sad2 = sad0; + __m128i sad3 = sad0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1); + src += src_stride; + + DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2 = __lsx_vadd_h(sad2, sad_tmp); + + DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3 = __lsx_vadd_h(sad3, sad_tmp); + } + sad_array[0] = hadd_uh_u32(sad0); + sad_array[1] = hadd_uh_u32(sad1); + sad_array[2] = hadd_uh_u32(sad2); + sad_array[3] = hadd_uh_u32(sad3); +} + +static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt = height; + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i sad, sad_tmp; + + __m128i sad0_0 = __lsx_vldi(0); + __m128i sad0_1 = sad0_0; + __m128i sad1_0 = sad0_0; + __m128i sad1_1 = sad0_0; + __m128i sad2_0 = sad0_0; + __m128i sad2_1 = sad0_0; + __m128i sad3_0 = sad0_0; + __m128i sad3_1 = sad0_0; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + + DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48, + ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48, + ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48, + ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp); + + DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48, + ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1); + sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3); + sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[0] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[1] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[2] = hadd_uw_u32(sad); + + sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0); + sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1); + sad = __lsx_vadd_w(sad, sad_tmp); + sad_array[3] = hadd_uw_u32(sad); +} + +static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i comp0, comp1, sad_tmp; + __m128i sad = __lsx_vldi(0); + uint8_t *src_tmp, *ref_tmp; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + src_tmp = (uint8_t *)src + 16; + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + ref_tmp = (uint8_t *)ref + 16; + ref0 = __lsx_vld(ref, 0); + DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4); + ref6 = __lsx_vldx(ref, ref_stride3); + ref1 = __lsx_vld(ref_tmp, 0); + DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3, + ref5); + ref7 = __lsx_vldx(ref_tmp, ref_stride3); + ref += ref_stride4; + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96, + pred0, pred2, pred4, pred6); + DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred, + 112, pred1, pred3, pred5, pred7); + sec_pred += 128; + + DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1); + sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1); + sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1); + sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1); + sad = __lsx_vadd_h(sad, sad_tmp); + } + res = hadd_uh_u32(sad); + return res; +} + +static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3; + __m128i sad, sad_tmp; + __m128i sad0 = __lsx_vldi(0); + __m128i sad1 = sad0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2, + ref3); + ref += ref_stride; + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3, + ref3, comp0, comp1, comp2, comp3); + sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1); + sad0 = __lsx_vadd_h(sad0, sad_tmp); + sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3); + sad1 = __lsx_vadd_h(sad1, sad_tmp); + } + sad = __lsx_vhaddw_wu_hu(sad0, sad0); + sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1); + sad = __lsx_vadd_w(sad, sad_tmp); + + res = hadd_sw_s32(sad); + return res; +} + +#define VPX_SAD_8xHT_LSX(height) \ + uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_16xHT_LSX(height) \ + uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_lsx(src, src_stride, ref, ref_stride, height); \ + } + +#define VPX_SAD_8xHTx4D_LSX(height) \ + void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ + sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_16xHTx4D_LSX(height) \ + void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_32xHTx4D_LSX(height) \ + void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_SAD_64xHTx4D_LSX(height) \ + void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define VPX_AVGSAD_32xHT_LSX(height) \ + uint32_t vpx_sad32x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define VPX_AVGSAD_64xHT_LSX(height) \ + uint32_t vpx_sad64x##height##_avg_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define SAD64 \ + VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \ + VPX_AVGSAD_64xHT_LSX(64) + +SAD64 + +#define SAD32 \ + VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \ + VPX_AVGSAD_32xHT_LSX(32) + +SAD32 + +#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16) + +SAD16 + +#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8) + +SAD8 + +#undef SAD64 +#undef SAD32 +#undef SAD16 +#undef SAD8 diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c new file mode 100644 index 0000000000..700793531c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_dsp/loongarch/variance_lsx.h" +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters_lsx[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i pred0, pred1, pred2, pred3, vec, vec_tmp; + __m128i avg0, avg1, avg2, avg3; + __m128i var = __lsx_vldi(0); + + avg0 = var; + avg1 = var; + avg2 = var; + avg3 = var; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48, + pred0, pred1, pred2, pred3); + sec_pred += 64; + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3, + pred3, src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec = __lsx_vhaddw_w_h(avg0, avg0); + vec_tmp = __lsx_vhaddw_w_h(avg1, avg1); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg2, avg2); + vec = __lsx_vadd_w(vec, vec_tmp); + vec_tmp = __lsx_vhaddw_w_h(avg3, avg3); + vec = __lsx_vadd_w(vec, vec_tmp); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_8width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3; + __m128i vec0, vec1, vec2, vec3, filt0, out, vec; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1, + FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS, + src0, src1, src2, src3); + out = __lsx_vpackev_d(src1, src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = __lsx_vpackev_d(src3, src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i vec, var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + src0, src1, src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4; + __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride, + filter, height, &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_8width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2); + src += src_stride; + dst += dst_stride; + DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3); + src += src_stride; + dst += dst_stride; + + DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_sse_diff_16width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec; + __m128i var = __lsx_vldi(0); + __m128i avg = var; + __m128i mask = { 0x0403030202010100, 0x0807070606050504 }; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + ref0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2); + ref3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t sub_pixel_sse_diff_32width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t sse = 0; + int32_t diff0[2]; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[0]); + src += 16; + dst += 16; + + sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[1]); + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t subpel_avg_ssediff_16w_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + __m128i pred0, pred1, pred2, pred3, filt0, vec; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + dst0 = __lsx_vld(dst, 0); + dst += dst_stride; + dst1 = __lsx_vld(dst, 0); + dst += dst_stride; + dst2 = __lsx_vld(dst, 0); + dst += dst_stride; + dst3 = __lsx_vld(dst, 0); + dst += dst_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, + pred3, tmp0, tmp1, tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + + return res; +} + +static uint32_t subpel_avg_ssediff_16w_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1, vec, filt0; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + src += src_stride; + src2 = __lsx_vld(src, 0); + src += src_stride; + src3 = __lsx_vld(src, 0); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + src0 = src4; + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t subpel_avg_ssediff_16w_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + uint32_t loop_cnt = (height >> 2); + int32_t res; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1; + __m128i mask = { 0x403030202010100, 0x807070606050504 }; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7); + src += src_stride; + + pred0 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred1 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred2 = __lsx_vld(sec_pred, 0); + sec_pred += width; + pred3 = __lsx_vld(sec_pred, 0); + sec_pred += width; + + HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1); + HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0); + HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + ref0 = __lsx_vld(dst, 0); + dst += dst_stride; + ref1 = __lsx_vld(dst, 0); + dst += dst_stride; + ref2 = __lsx_vld(dst, 0); + dst += dst_stride; + ref3 = __lsx_vld(dst, 0); + dst += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3, + pred3, out0, out1, out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_lsx( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_lsx( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_lsx( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16) +VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32) + +#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht) \ + uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_lsx[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_lsx[y_offset]; \ + \ + if (y_offset) { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (x_offset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_lsx( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64) diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c new file mode 100644 index 0000000000..943a5c5a9b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/subtract_lsx.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3; + __m128i pred0, pred1, pred2, pred3; + __m128i diff0, diff1; + __m128i reg0, reg1; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t diff_stride2 = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t diff_stride3 = diff_stride2 + diff_stride; + + DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2, + src0, src2, pred0, pred2); + DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0); + reg0 = __lsx_vilvl_b(src0, pred0); + reg1 = __lsx_vilvh_b(src0, pred0); + DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1); + __lsx_vstelm_d(diff0, diff_ptr, 0, 0); + __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0); + __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1); +} + +static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0, + pred1, pred2, pred3); + src_ptr += src_stride4; + pred_ptr += pred_stride4; + + DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5, + src6, src7); + DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0, + pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4, + pred5, pred6, pred7); + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg1, reg2, reg3); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + reg4, reg5, reg6, reg7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + __lsx_vst(src0, diff_ptr, 0); + __lsx_vstx(src1, diff_ptr, dst_stride); + __lsx_vstx(src2, diff_ptr, dst_stride2); + __lsx_vstx(src3, diff_ptr, dst_stride3); + diff_ptr += dst_stride2; + __lsx_vst(src4, diff_ptr, 0); + __lsx_vstx(src5, diff_ptr, dst_stride); + __lsx_vstx(src6, diff_ptr, dst_stride2); + __lsx_vstx(src7, diff_ptr, dst_stride3); +} + +static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t dst_stride = diff_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t dst_stride2 = dst_stride << 1; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + int16_t *diff_tmp = diff + 8; + + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred, + pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4); + src += src_stride4; + pred += pred_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + pred, pred_stride, src5, src6, src7, pred5); + DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7, + src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, + pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7, + pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vstx(src2, diff, dst_stride); + __lsx_vstx(src4, diff, dst_stride2); + __lsx_vstx(src6, diff, dst_stride3); + __lsx_vst(src1, diff_tmp, 0); + __lsx_vstx(src3, diff_tmp, dst_stride); + __lsx_vstx(src5, diff_tmp, dst_stride2); + __lsx_vstx(src7, diff_tmp, dst_stride3); + diff += dst_stride2; + diff_tmp += dst_stride2; + __lsx_vst(pred0, diff, 0); + __lsx_vstx(pred2, diff, dst_stride); + __lsx_vstx(pred4, diff, dst_stride2); + __lsx_vstx(pred6, diff, dst_stride3); + __lsx_vst(pred1, diff_tmp, 0); + __lsx_vstx(pred3, diff_tmp, dst_stride); + __lsx_vstx(pred5, diff_tmp, dst_stride2); + __lsx_vstx(pred7, diff_tmp, dst_stride3); +} + +static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + int32_t src_stride2 = src_stride << 1; + int32_t pred_stride2 = pred_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t pred_stride3 = pred_stride2 + pred_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t pred_stride4 = pred_stride2 << 1; + + for (loop_cnt = 8; loop_cnt--;) { + const uint8_t *src_tmp = src + 16; + const uint8_t *pred_tmp = pred + 16; + DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1, + pred0, pred1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred, + pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3); + DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred, + pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7); + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + src += src_stride4; + pred += pred_stride4; + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + diff += diff_stride; + __lsx_vst(src4, diff, 0); + __lsx_vst(src5, diff, 16); + __lsx_vst(src6, diff, 32); + __lsx_vst(src7, diff, 48); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + diff += diff_stride; + __lsx_vst(pred4, diff, 0); + __lsx_vst(pred5, diff, 16); + __lsx_vst(pred6, diff, 32); + __lsx_vst(pred7, diff, 48); + diff += diff_stride; + } +} + +static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + uint32_t loop_cnt; + + for (loop_cnt = 32; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1, + pred2, pred3); + src += src_stride; + pred += pred_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5, + pred6, pred7); + src += src_stride; + pred += pred_stride; + + DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg0, reg2, reg4, reg6); + DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3, + reg1, reg3, reg5, reg7); + DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp0, tmp2, tmp4, tmp6); + DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7, + tmp1, tmp3, tmp5, tmp7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, + reg3, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, + reg7, src4, src5, src6, src7); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, + tmp3, pred0, pred1, pred2, pred3); + DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, + tmp7, pred4, pred5, pred6, pred7); + __lsx_vst(src0, diff, 0); + __lsx_vst(src1, diff, 16); + __lsx_vst(src2, diff, 32); + __lsx_vst(src3, diff, 48); + __lsx_vst(src4, diff, 64); + __lsx_vst(src5, diff, 80); + __lsx_vst(src6, diff, 96); + __lsx_vst(src7, diff, 112); + diff += diff_stride; + __lsx_vst(pred0, diff, 0); + __lsx_vst(pred1, diff, 16); + __lsx_vst(pred2, diff, 32); + __lsx_vst(pred3, diff, 48); + __lsx_vst(pred4, diff, 64); + __lsx_vst(pred5, diff, 80); + __lsx_vst(pred6, diff, 96); + __lsx_vst(pred7, diff, 112); + diff += diff_stride; + } +} + +void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h new file mode 100644 index 0000000000..bd514831bf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + do { \ + __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + __m128i k0_m, k1_m, k2_m, k3_m; \ + \ + k0_m = __lsx_vreplgr2vr_h(cnst0); \ + k1_m = __lsx_vreplgr2vr_h(cnst1); \ + k2_m = __lsx_vpackev_h(k1_m, k0_m); \ + \ + DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m); \ + DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m); \ + \ + DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \ + k3_m = __lsx_vmulwod_w_h(s5_m, k1_m); \ + s1_m = __lsx_vsub_w(s1_m, k3_m); \ + k3_m = __lsx_vmulwod_w_h(s4_m, k1_m); \ + s0_m = __lsx_vsub_w(s0_m, k3_m); \ + \ + out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m); \ + out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS); \ + } while (0) + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3) \ + do { \ + __m128i tp0_m, tp1_m; \ + \ + DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \ + in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c new file mode 100644 index 0000000000..8fad342c71 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/variance_lsx.h" + +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) + +static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t ref_stride2 = ref_stride << 1; + int32_t ref_stride3 = ref_stride2 + ref_stride; + int32_t ref_stride4 = ref_stride2 << 1; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0, + src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1, + src2, src3); + src_ptr += src_stride4; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0, + ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1, + ref2, ref3); + ref_ptr += ref_stride4; + + DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref, vec; + __m128i avg = __lsx_vldi(0); + __m128i var = avg; + + for (; ht_cnt--;) { + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = __lsx_vld(src_ptr, 0); + src_ptr += src_stride; + ref = __lsx_vld(ref_ptr, 0); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t res, ht_cnt = (height >> 2); + __m128i avg = __lsx_vldi(0); + __m128i src0, src1, ref0, ref1; + __m128i vec; + __m128i var = avg; + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1); + src_ptr += src_stride; + DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __lsx_vhaddw_w_h(avg, avg); + HADD_SW_S32(vec, *diff); + HADD_SW_S32(var, res); + return res; +} + +static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t res, ht_cnt = 32; + __m128i avg0 = __lsx_vldi(0); + __m128i src0, src1, src2, src3; + __m128i ref0, ref1, ref2, ref3; + __m128i vec0, vec1; + __m128i avg1 = avg0; + __m128i avg2 = avg0; + __m128i avg3 = avg0; + __m128i var = avg0; + + for (; ht_cnt--;) { + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48, + src0, src1, src2, src3); + src_ptr += src_stride; + DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48, + ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + vec0 = __lsx_vhaddw_w_h(avg0, avg0); + vec1 = __lsx_vhaddw_w_h(avg1, avg1); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg2, avg2); + vec0 = __lsx_vadd_w(vec0, vec1); + vec1 = __lsx_vhaddw_w_h(avg3, avg3); + vec0 = __lsx_vadd_w(vec0, vec1); + HADD_SW_S32(vec0, *diff); + HADD_SW_S32(var, res); + return res; +} + +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6) +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8) + +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10) +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12) + +#define VPX_VARIANCE_WDXHT_LSX(wd, ht) \ + uint32_t vpx_variance##wd##x##ht##_lsx( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t res, ht_cnt = (height >> 2); + __m128i src, ref; + __m128i var = __lsx_vldi(0); + + for (; ht_cnt--;) { + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref); + src_ptr += src_stride; + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + HADD_SW_S32(var, res); + return res; +} + +VPX_VARIANCE_WDXHT_LSX(8, 8) +VPX_VARIANCE_WDXHT_LSX(16, 16) +VPX_VARIANCE_WDXHT_LSX(32, 32) + +uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum); +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h new file mode 100644 index 0000000000..cf9e9890ff --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/variance_lsx.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ + +#include "vpx_util/loongson_intrinsics.h" + +#define HADD_SW_S32(in0, in1) \ + do { \ + __m128i res0_m; \ + \ + res0_m = __lsx_vhaddw_d_w(in0, in0); \ + res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \ + in1 = __lsx_vpickve2gr_w(res0_m, 0); \ + } while (0) + +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + tmp0_m = __lsx_vshuf_b(in1, in0, mask); \ + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); \ + in2 = __lsx_vsrari_h(tmp1_m, shift); \ + } while (0) + +#define CALC_MSE_B(src, ref, var) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + } while (0) + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + do { \ + __m128i src_l0_m, src_l1_m; \ + __m128i res_l0_m, res_l1_m; \ + \ + src_l0_m = __lsx_vilvl_b(src, ref); \ + src_l1_m = __lsx_vilvh_b(src, ref); \ + DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \ + res_l0_m, res_l1_m); \ + var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m); \ + var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m); \ + sub = __lsx_vadd_h(sub, res_l0_m); \ + sub = __lsx_vadd_h(sub, res_l1_m); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c new file mode 100644 index 0000000000..1c59228813 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1; + __m128i dst0, dst1, dst2, dst3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7); + tmp0 = __lsx_vxori_b(tmp0, 128); + dst0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst0 = __lsx_vilvl_d(tmp1, tmp0); + + tmp0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + tmp3 = __lsx_vldrepl_w(dst_tmp, 0); + tmp0 = __lsx_vilvl_w(tmp1, tmp0); + tmp1 = __lsx_vilvl_w(tmp3, tmp2); + dst1 = __lsx_vilvl_d(tmp1, tmp0); + + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp0, tmp1); + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, tmp2, tmp3); + DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7, + tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 3); +} + +static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, tmp0, + tmp1, tmp2, tmp3); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1); + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height >> 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3); + src += src_stride; + dst0 = __lsx_vld(dst_tmp, 0); + dst1 = __lsx_vldx(dst_tmp, dst_stride); + dst_tmp += dst_stride2; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3); + DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3); + DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + dst += dst_stride2; + } +} + +static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3, dst0, dst1; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2, + mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2, + mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2, + mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2, + mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15); + DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3, + filter0, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, + tmp11, filter2, tmp8, tmp9, tmp10, tmp11); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2, + tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3); + DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, + tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, + tmp7); + DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1); + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3, dst0, dst1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1); + vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS); + vec0 = __lsx_vavgr_bu(vec0, dst0); + __lsx_vstelm_w(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(vec0, dst, 0, 3); +} + +static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i vec4, vec5, vec6, vec7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + src_stride4; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src4 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res2, dst, 0, 3); + dst += dst_stride; +} + +static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec1, dst, 0, 1); +} + +static void common_hz_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + uint8_t *dst_tmp = dst; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec2); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2); + __lsx_vstelm_d(vec0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(vec2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp1 = (uint8_t *)src + 8; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0, + res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src_tmp1 += src_stride4; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + dst0 = __lsx_vld(dst, 0); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res2 = __lsx_vavgr_bu(res2, dst0); + __lsx_vst(res2, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + dst += dst_stride; + + dst0 = __lsx_vld(dst, 0); + res6 = __lsx_vavgr_bu(res6, dst0); + __lsx_vst(res6, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, res0, res1, res2, res3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, res4, res5, res6, res7); + DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2, + FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, + res0, res2, res4, res6); + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res0 = __lsx_vavgr_bu(res0, dst0); + __lsx_vst(res0, dst, 0); + res2 = __lsx_vavgr_bu(res2, dst1); + __lsx_vst(res2, dst, 16); + dst += dst_stride; + + DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1); + res4 = __lsx_vavgr_bu(res4, dst0); + __lsx_vst(res4, dst, 0); + res6 = __lsx_vavgr_bu(res6, dst1); + __lsx_vst(res6, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, dst0, dst1, dst2, dst3; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1); + DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3); + DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5); + DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out2, out4, out6); + + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2, + dst3); + out0 = __lsx_vavgr_bu(out0, dst0); + __lsx_vst(out0, dst, 0); + out2 = __lsx_vavgr_bu(out2, dst1); + __lsx_vst(out2, dst, 16); + out4 = __lsx_vavgr_bu(out4, dst2); + __lsx_vst(out4, dst, 32); + out6 = __lsx_vavgr_bu(out6, dst3); + __lsx_vst(out6, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + + case 32: + common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c new file mode 100644 index 0000000000..d1abf622ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src4 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src5 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3); + src2 = __lsx_vilvl_d(src3, src2); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src2); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *_src = (uint8_t *)src - 3 - src_stride3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3, + FILTER_BITS, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + src5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src7 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src8 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src9 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7); + DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + + dst0 = __lsx_vldrepl_w(dst, 0); + dst1 = __lsx_vldrepl_w(dst + dst_stride, 0); + dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0); + dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, res0, res1); + DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 3); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *dst_tmp = dst; + + /* rearranging filter */ + mask = __lsx_vld(mc_filt_mask_arr, 0); + + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint8_t *src_tmp1; + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride << 2; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + + for (; loop_cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src_tmp1 = (uint8_t *)(src + 8); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst0); + __lsx_vst(tmp3, dst, 0); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst1); + __lsx_vstx(tmp3, dst, dst_stride); + + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst2); + __lsx_vstx(tmp3, dst, dst_stride2); + + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp3 = __lsx_vavgr_bu(tmp3, dst3); + __lsx_vstx(tmp3, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c new file mode 100644 index 0000000000..5c6413df44 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c @@ -0,0 +1,918 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1); + src0 = __lsx_vilvl_d(src1, src0); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + out0 = __lsx_vavgr_bu(out0, src0); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_tmp = dst; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src4 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5, + src6); + src_tmp0 += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp0, 0); + DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp0, src_stride3); + src_tmp0 += src_stride4; + src0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + src3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1); + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height, int32_t width) { + uint8_t *src_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *src_tmp0 = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + uint8_t *dst_reg = dst; + + src_tmp = src_tmp0; + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vld(dst_reg, 0); + tmp3 = __lsx_vldx(dst_reg, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vst(tmp0, dst_reg, 0); + __lsx_vstx(tmp1, dst_reg, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + tmp2 = __lsx_vldx(dst_reg, dst_stride2); + tmp3 = __lsx_vldx(dst_reg, dst_stride3); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1); + __lsx_vstx(tmp0, dst_reg, dst_stride2); + __lsx_vstx(tmp1, dst_reg, dst_stride3); + dst_reg += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src_tmp0 += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int8_t *filter, + int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + __m128i src10_r, src32_r, src21_r, src43_r; + __m128i tmp0, tmp1; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + src += src_stride; + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110, + src4332); + DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + out = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); + dst += dst_stride; +} + +static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i dst0, dst1, dst2, dst3, dst4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + __m128i src2110, src4332, src6554, src8776, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src8 = __lsx_vld(src, 0); + + dst0 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst0 = __lsx_vilvl_w(dst1, dst0); + dst1 = __lsx_vilvl_w(dst3, dst2); + dst0 = __lsx_vilvl_d(dst1, dst0); + + dst1 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_w(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_w(dst_tmp, 0); + dst1 = __lsx_vilvl_w(dst2, dst1); + dst2 = __lsx_vilvl_w(dst4, dst3); + dst1 = __lsx_vilvl_d(dst2, dst1); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r); + DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, + src87_r, src76_r, src2110, src4332, src6554, src8776); + DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0, + src8776, filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_w(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp0, dst, 0, 3); + dst += dst_stride; + + __lsx_vstelm_w(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(tmp2, dst, 0, 3); +} + +static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1); + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); +} + +static void common_vt_2t_and_aver_dst_8x8mult_lsx( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i dst0, dst1, dst2, dst3, dst4, dst5; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1, tmp2, tmp3; + uint8_t *dst_tmp = dst; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7); + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst1 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1); + + dst2 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst3 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst4 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + dst5 = __lsx_vldrepl_d(dst_tmp, 0); + dst_tmp += dst_stride; + DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp2); + DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2); + __lsx_vstelm_d(tmp0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (height == 4) { + common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp0, tmp1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + + src_tmp1 = src + 16; + src6 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7, + src8); + src9 = __lsx_vldx(src_tmp1, src_stride3); + + dst_tmp1 = dst + 16; + dst4 = __lsx_vld(dst_tmp1, 0); + DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5, + dst6); + dst7 = __lsx_vldx(dst_tmp1, dst_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vstx(tmp0, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + dst += dst_stride; + __lsx_vst(tmp0, dst, 16); + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *src_tmp1; + uint8_t *dst_tmp1; + __m128i src0, src1, src2, src3, src4, src5; + __m128i src6, src7, src8, src9, src10, src11, filt0; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i tmp0, tmp1; + + filt0 = __lsx_vldrepl_h(filter, 0); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + src2 = __lsx_vldx(src, src_stride); + dst1 = __lsx_vldx(dst, dst_stride); + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4, + dst6); + src_tmp1 = (uint8_t *)src + 16; + src5 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src8 = __lsx_vldx(src_tmp1, src_stride); + src_tmp1 = src_tmp1 + 16; + src11 = __lsx_vldx(src_tmp1, src_stride); + + dst_tmp1 = dst + 16; + dst3 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 32; + dst5 = __lsx_vldx(dst_tmp1, dst_stride); + dst_tmp1 = dst + 48; + dst7 = __lsx_vldx(dst_tmp1, dst_stride); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst0); + __lsx_vst(tmp0, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst1); + __lsx_vstx(tmp0, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst2); + __lsx_vst(tmp0, dst, 16); + + dst_tmp1 = dst + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst3); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst4); + __lsx_vst(tmp0, dst, 32); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst5); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst6); + __lsx_vst(tmp0, dst, 48); + + dst_tmp1 = dst_tmp1 + 16; + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + tmp0 = __lsx_vavgr_bu(tmp0, dst7); + __lsx_vstx(tmp0, dst_tmp1, dst_stride); + dst += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c new file mode 100644 index 0000000000..2c6459a978 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out, out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= 3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + out = __lsx_vssrarni_b_h(out1, out0, 7); + out = __lsx_vxori_b(out, 128); + __lsx_vstelm_w(out, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out, dst, 0, 3); +} + +static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out1, dst, 0, 3); +} + +static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter) { + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filter0, filter1, filter2, filter3, out0, out1, + out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); +} + +static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + uint8_t *_src = (uint8_t *)src - 3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + int32_t stride = src_stride << 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + const uint8_t *_src = src + src_stride; + DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2); + DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + src += stride; + } +} + +static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 1; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + + dst += dst_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int8_t *filter, int32_t height) { + int32_t loop_cnt = height; + __m128i src0, src1, src2, src3; + __m128i filter0, filter1, filter2, filter3; + __m128i mask0, mask1, mask2, mask3; + __m128i out0, out1, out2, out3; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= 3; + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + + DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2); + src3 = __lsx_vld(src, 56); + src1 = __lsx_vshuf_b(src2, src0, shuff); + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filter0, filter1, filter2, filter3, out0, + out1, out2, out3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vst(out0, dst, 32); + __lsx_vst(out1, dst, 48); + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, mask; + __m128i filt0, vec0, vec1, vec2, vec3, res0, res1; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3, + FILTER_BITS, res0, res1); + + __lsx_vstelm_w(res0, dst, 0, 0); + __lsx_vstelm_w(res0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i res0, res1, res2, res3, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride + dst_stride2; + + uint8_t *src_tmp1 = src + src_stride4; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp1, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask, + src7, src6, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0, + res1, res2, res3); + + __lsx_vstelm_w(res0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(res1, dst, 0, 1); + dst += dst_stride; + + __lsx_vstelm_w(res2, dst, 0, 0); + __lsx_vstelm_w(res2, dst + dst_stride, 0, 1); + __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, vec0, vec1); + + __lsx_vstelm_d(vec0, dst, 0, 0); + __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1); +} + +static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + __m128i filt0, mask; + __m128i src0, src1, src2, src3, out0, out1; + __m128i vec0, vec1, vec2, vec3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + if (height == 16) { + uint8_t *dst_tmp1 = dst + dst_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, vec0, vec1, vec2, vec3); + DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst_tmp1, 0, 0); + __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1); + } +} + +static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2) - 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + uint8_t *src_tmp1 = src + 8; + mask = __lsx_vld(mc_filt_mask_arr, 0); + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask, + src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask, + src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0, + out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0, + out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + + for (; loop_cnt--;) { + src_tmp1 += src_stride4; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4); + src6 = __lsx_vldx(src, src_stride3); + + src1 = __lsx_vld(src_tmp1, 0); + DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3, + src5); + src7 = __lsx_vldx(src_tmp1, src_stride3); + src += src_stride4; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + dst += dst_stride; + __lsx_vst(out1, dst, 0); + dst += dst_stride; + __lsx_vst(out2, dst, 0); + dst += dst_stride; + __lsx_vst(out3, dst, 0); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2); + src3 = __lsx_vld(src, 24); + src1 = __lsx_vshuf_b(src2, src0, shuff); + src += src_stride; + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6); + src7 = __lsx_vld(src, 24); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + dst += dst_stride; + + __lsx_vst(out2, dst, 0); + __lsx_vst(out3, dst, 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + __m128i out0, out1, out2, out3, out4, out5, out6, out7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + filt0 = __lsx_vldrepl_h(filter, 0); + + for (; loop_cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4, + src6); + src7 = __lsx_vld(src, 56); + DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3); + src5 = __lsx_vshuf_b(src6, src4, shuff); + src += src_stride; + + DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, + mask, src3, src3, mask, vec0, vec1, vec2, vec3); + DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, + mask, src7, src7, mask, vec4, vec5, vec6, vec7); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, out0, out1, out2, out3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, out4, out5, out6, out7); + DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2, + FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, + out0, out1, out2, out3); + + __lsx_vst(out0, dst, 0); + __lsx_vst(out1, dst, 16); + __lsx_vst(out2, dst, 32); + __lsx_vst(out3, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_x = filter[x0_q4]; + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + if (vpx_get_filter_taps(filter_x) == 2) { + switch (w) { + case 4: + common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 16: + common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 32: + common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + + case 64: + common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c new file mode 100644 index 0000000000..9f5cd6cfe9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c @@ -0,0 +1,697 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i out0, out1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + mask0 = __lsx_vld(mc_filt_mask_arr, 16); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3); + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1); + tmp2 = __lsx_vpackev_b(tmp5, tmp4); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff); + tmp4 = __lsx_vpackev_b(tmp3, tmp4); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vshuf_b(src1, tmp3, shuff); + src0 = __lsx_vpackev_b(src1, src0); + out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + tmp5 = src1; + tmp0 = tmp2; + tmp1 = tmp4; + tmp2 = src0; + } +} + +static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3; + __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3; + __m128i mask0, mask1, mask2, mask3; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; + __m128i out0, out1; + + mask0 = __lsx_vld(mc_filt_mask_arr, 0); + src -= (3 + 3 * src_stride); + DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4, + filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2); + mask3 = __lsx_vaddi_bu(mask0, 6); + + LSX_LD_4(src, src_stride, src0, src1, src2, src3); + src += src_stride; + src4 = __lsx_vld(src, 0); + src += src_stride; + src5 = __lsx_vld(src, 0); + src += src_stride; + src6 = __lsx_vld(src, 0); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + + src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4, + filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1, + tmp0, tmp1, tmp2, tmp4); + DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6); + + for (; loop_cnt--;) { + LSX_LD_4(src, src_stride, src7, src8, src9, src10); + src += src_stride; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + tmp3 = __lsx_vpackev_b(src7, src6); + out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src0 = __lsx_vpackev_b(src8, src7); + out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src1 = __lsx_vpackev_b(src9, src8); + src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + src2 = __lsx_vpackev_b(src10, src9); + src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + src6 = src10; + tmp0 = tmp2; + tmp1 = tmp3; + tmp2 = src1; + tmp4 = tmp6; + tmp5 = src0; + tmp6 = src2; + } +} + +static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; +} + +static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_vt, filt_hz, vec0, vec1; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + filt_hz = __lsx_vldrepl_h(filter_horiz, 0); + filt_vt = __lsx_vldrepl_h(filter_vert, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + + hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff); + hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2); + + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7; + __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 16); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += src_stride4; + + hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz); + hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz); + hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz); + hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz); + + DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff, + hz_out1, hz_out3); + hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff); + hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6); + DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5, + hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3, + filt_vt, vec4, vec5, vec6, vec7); + DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5, + FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4, + vec5, vec6, vec7); + + __lsx_vstelm_w(vec4, dst, 0, 0); + __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1); + dst += dst_stride4; + __lsx_vstelm_w(vec6, dst, 0, 0); + __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1); + __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0); + __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (height == 8) { + common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3; + __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec1 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec2 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec3 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_d(tmp0, dst, 0, 0); + __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1); +} + +static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, mask; + __m128i filt_hz, filt_vt, vec0; + __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + + hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out1, hz_out0); + tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt); + + hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + vec0 = __lsx_vpackev_b(hz_out0, hz_out1); + tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt); + + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3, + FILTER_BITS, tmp1, tmp2); + + __lsx_vstelm_d(tmp1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp1, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(tmp2, dst, 0, 1); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (height == 4) { + common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask; + __m128i filt_hz, filt_vt, vec0, vec1; + __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + mask = __lsx_vld(mc_filt_mask_arr, 0); + + /* rearranging filter */ + DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt); + + DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1); + src += src_stride; + + hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + 8; + + DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src, + src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7); + src += src_stride4; + + hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz); + hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz); + hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz); + DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2); + tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); +} + +static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c new file mode 100644 index 0000000000..6022e43c83 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/loongarch/vpx_convolve_lsx.h" + +static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + __m128i reg0, reg1, reg2, reg3, reg4; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1; + uint8_t *_src = (uint8_t *)src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + src0 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2); + src3 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + src4 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6); + _src += src_stride3; + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0, + tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5); + DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1); + reg2 = __lsx_vilvl_d(tmp5, tmp2); + DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1); + reg2 = __lsx_vxori_b(reg2, 128); + + for (; loop_cnt--;) { + src7 = __lsx_vld(_src, 0); + DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9); + src10 = __lsx_vldx(_src, src_stride3); + _src += src_stride4; + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4); + DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1, + filter2, filter3); + out0 = __lsx_vssrarni_b_h(out1, out0, 7); + out0 = __lsx_vxori_b(out0, 128); + __lsx_vstelm_w(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 2); + dst += dst_stride; + __lsx_vstelm_w(out0, dst, 0, 3); + dst += dst_stride; + + reg0 = reg2; + reg1 = reg3; + reg2 = reg4; + src6 = src10; + } +} + +static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i filter0, filter1, filter2, filter3; + __m128i out0, out1, out2, out3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + src = src - src_stride3; + + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + tmp0, tmp1, tmp2, tmp3); + out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1, + filter2, filter3); + out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1, + filter2, filter3); + out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1, + filter2, filter3); + out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1); + DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1); + __lsx_vstelm_d(out0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out0, dst, 0, 1); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(out1, dst, 0, 1); + dst += dst_stride; + + reg0 = reg2; + reg1 = tmp0; + reg2 = tmp2; + reg3 = reg5; + reg4 = tmp1; + reg5 = tmp3; + src6 = src10; + } +} + +static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = height >> 2; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + // uint8_t *_src = (uint8_t *)src - src_stride3; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0, + reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6, + reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9); + src10 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7, + src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst, 0); + dst += dst_stride; + __lsx_vst(tmp1, dst, 0); + dst += dst_stride; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t cnt = width >> 4; + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i filter0, filter1, filter2, filter3; + __m128i reg0, reg1, reg2, reg3, reg4, reg5; + __m128i reg6, reg7, reg8, reg9, reg10, reg11; + __m128i tmp0, tmp1, tmp2, tmp3; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride + src_stride2; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + src -= src_stride3; + DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, + filter0, filter1, filter2, filter3); + + for (; cnt--;) { + uint32_t loop_cnt = height >> 2; + + src_tmp = src; + dst_tmp = dst; + + src0 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1, + src2); + src3 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride3; + + DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0, + src1, src2, src3); + DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5); + src6 = __lsx_vxori_b(src6, 128); + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg0, reg1, reg2, reg3); + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5); + DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, + reg6, reg7, reg8, reg9); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11); + + for (; loop_cnt--;) { + src7 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8, + src9); + src10 = __lsx_vldx(src_tmp, src_stride3); + src_tmp += src_stride4; + DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, + src7, src8, src9, src10); + DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9, + src0, src1, src2, src3); + DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9, + src4, src5, src7, src8); + tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vst(tmp0, dst_tmp, 0); + __lsx_vstx(tmp1, dst_tmp, dst_stride); + tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1, + filter2, filter3); + tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1, + filter2, filter3); + tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1, + filter2, filter3); + tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1, + filter2, filter3); + DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1); + DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1); + __lsx_vstx(tmp0, dst_tmp, dst_stride2); + __lsx_vstx(tmp1, dst_tmp, dst_stride3); + dst_tmp += dst_stride4; + + reg0 = reg2; + reg1 = src0; + reg2 = src2; + reg3 = reg5; + reg4 = src1; + reg5 = src3; + reg6 = reg8; + reg7 = src4; + reg8 = src7; + reg9 = reg11; + reg10 = src5; + reg11 = src8; + src6 = src10; + } + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i filt0, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); +} + +static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5; + __m128i vec6, vec7, vec8, vec9, vec10, vec11; + __m128i tmp0, tmp1, tmp2, tmp3; + __m128i filt0; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + uint8_t *dst_tmp1 = dst + dst_stride4; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src5, src6, src7, src8); + src += (src_stride4 + src_stride); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4, + vec5, vec6, vec7); + DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8, + vec9, vec10, vec11); + + DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, tmp0, tmp1); + + __lsx_vstelm_w(tmp0, dst, 0, 0); + __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1); + __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3); + + __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2); + __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3); +} + +static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter); + } else if (height == 8) { + common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0, + vec1, vec2, vec3); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0, + tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); +} + +static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 3); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i out0, out1, tmp0, tmp1, tmp2, tmp3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src5 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7) + src8 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, + vec0, vec1, vec2, vec3); + DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, + vec4, vec5, vec6, vec7); + DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, + filt0, tmp0, tmp1, tmp2, tmp3); + DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2, + FILTER_BITS, out0, out1); + + __lsx_vstelm_d(out0, dst, 0, 0); + __lsx_vstelm_d(out0, dst + dst_stride, 0, 1); + __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0); + __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1); + dst += dst_stride4; + + src0 = src8; + } +} + +static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (height == 4) { + common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + filt0 = __lsx_vldrepl_h(filter, 0); + + src0 = __lsx_vld(src, 0); + src += src_stride; + + for (; loop_cnt--;) { + src1 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3); + src4 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 2); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + uint8_t *src_tmp; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5); + src += src_stride; + src_tmp = src + 16; + + for (; loop_cnt--;) { + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src7, src3, src8); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9); + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + src += src_stride4; + src_tmp += src_stride4; + + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride); + + DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride2); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vstx(tmp, dst, dst_stride3); + + DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + dst += dst_stride; + __lsx_vst(tmp, dst, 16); + + dst += dst_stride; + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt = (height >> 1); + __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + __m128i tmp, tmp0, tmp1; + + int32_t src_stride2 = src_stride << 1; + int32_t dst_stride2 = dst_stride << 1; + uint8_t *dst_tmp1 = dst + dst_stride; + + filt0 = __lsx_vldrepl_h(filter, 0); + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6, + src9); + src += src_stride; + + for (; loop_cnt--;) { + uint8_t *src_tmp0 = src + src_stride; + + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7, + src10); + DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48, + src2, src5, src8, src11); + src += src_stride2; + + DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 0); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 0); + + DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 16); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 16); + + DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2); + DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3); + DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 32); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 32); + + DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6); + DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7); + DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst, 48); + + DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1); + tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS); + __lsx_vst(tmp, dst_tmp1, 48); + dst += dst_stride2; + dst_tmp1 += dst_stride2; + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16_t *const filter_y = filter[y0_q4]; + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (vpx_get_filter_taps(filter_y) == 2) { + switch (w) { + case 4: + common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c new file mode 100644 index 0000000000..1dad29eeed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1; + __m128i dst0, dst1; + + int32_t src_stride2 = src_stride << 1; + + if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + dst0 = __lsx_vld(dst, 0); + dst1 = __lsx_vldx(dst, dst_stride); + DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1); + + __lsx_vstelm_w(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_w(dst1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + __m128i src0, src1, src2, src3; + __m128i dst0, dst1, dst2, dst3; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + + __lsx_vstelm_d(dst0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(dst3, dst, 0, 0); + dst += dst_stride; + } +} + +static void avg_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + src4 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src7 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + dst0 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2); + dst3 = __lsx_vldx(dst, dst_stride3); + dst += dst_stride4; + dst4 = __lsx_vld(dst, 0); + DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6); + dst7 = __lsx_vldx(dst, dst_stride3); + dst -= dst_stride4; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst1, dst, dst_stride); + __lsx_vstx(dst2, dst, dst_stride2); + __lsx_vstx(dst3, dst, dst_stride3); + dst += dst_stride4; + __lsx_vst(dst4, dst, 0); + __lsx_vstx(dst5, dst, dst_stride); + __lsx_vstx(dst6, dst, dst_stride2); + __lsx_vstx(dst7, dst, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 8); + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + int32_t dst_stride2 = dst_stride << 1; + int32_t dst_stride3 = dst_stride2 + dst_stride; + int32_t dst_stride4 = dst_stride2 << 1; + + for (; cnt--;) { + uint8_t *dst_tmp = dst; + uint8_t *dst_tmp1 = dst_tmp + 16; + uint8_t *src_tmp = src + 16; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src2, src3, src4, src5); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6, + dst7); + dst_tmp += dst_stride4; + dst_tmp1 += dst_stride4; + + src_tmp = src + 16; + DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9); + DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src, + src_stride2, src_tmp, src_stride2, src10, src11, src12, src13); + DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15); + src += src_stride4; + + DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9); + DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp, + dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13); + DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14, + dst15); + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + dst_tmp = dst + 16; + __lsx_vst(dst0, dst, 0); + __lsx_vstx(dst2, dst, dst_stride); + __lsx_vstx(dst4, dst, dst_stride2); + __lsx_vstx(dst6, dst, dst_stride3); + __lsx_vst(dst1, dst_tmp, 0); + __lsx_vstx(dst3, dst_tmp, dst_stride); + __lsx_vstx(dst5, dst_tmp, dst_stride2); + __lsx_vstx(dst7, dst_tmp, dst_stride3); + dst += dst_stride4; + + __lsx_vst(dst8, dst, 0); + __lsx_vstx(dst10, dst, dst_stride); + __lsx_vstx(dst12, dst, dst_stride2); + __lsx_vstx(dst14, dst, dst_stride3); + __lsx_vst(dst9, dst_tmp1, 0); + __lsx_vstx(dst11, dst_tmp1, dst_stride); + __lsx_vstx(dst13, dst_tmp1, dst_stride2); + __lsx_vstx(dst15, dst_tmp1, dst_stride3); + dst += dst_stride4; + } +} + +static void avg_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt = (height / 4); + uint8_t *dst_tmp = dst; + + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + __m128i src8, src9, src10, src11, src12, src13, src14, src15; + __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (; cnt--;) { + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2, + src3); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6, + src7); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10, + src11); + src += src_stride; + DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14, + src15); + src += src_stride; + + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst0, dst1, dst2, dst3); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst4, dst5, dst6, dst7); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst8, dst9, dst10, dst11); + dst_tmp += dst_stride; + DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, + dst12, dst13, dst14, dst15); + dst_tmp += dst_stride; + + DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3); + DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7); + DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11, + dst11, dst8, dst9, dst10, dst11); + DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15, + dst15, dst12, dst13, dst14, dst15); + + __lsx_vst(dst0, dst, 0); + __lsx_vst(dst1, dst, 16); + __lsx_vst(dst2, dst, 32); + __lsx_vst(dst3, dst, 48); + dst += dst_stride; + __lsx_vst(dst4, dst, 0); + __lsx_vst(dst5, dst, 16); + __lsx_vst(dst6, dst, 32); + __lsx_vst(dst7, dst, 48); + dst += dst_stride; + __lsx_vst(dst8, dst, 0); + __lsx_vst(dst9, dst, 16); + __lsx_vst(dst10, dst, 32); + __lsx_vst(dst11, dst, 48); + dst += dst_stride; + __lsx_vst(dst12, dst, 0); + __lsx_vst(dst13, dst, 16); + __lsx_vst(dst14, dst, 32); + __lsx_vst(dst15, dst, 48); + dst += dst_stride; + } +} + +void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + switch (w) { + case 4: { + avg_width4_lsx(src, src_stride, dst, dst_stride, h); + break; + } + + case 8: { + avg_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c new file mode 100644 index 0000000000..53dc7097ed --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_util/loongson_intrinsics.h" + +static void copy_width8_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + for (cnt = height >> 3; cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + + __lsx_vstelm_d(src4, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src5, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src6, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src7, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 4) == 0) { + for (cnt = (height / 4); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src2, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src3, dst, 0, 0); + dst += dst_stride; + } + } else if ((height % 2) == 0) { + for (cnt = (height / 2); cnt--;) { + src0 = __lsx_vld(src, 0); + src1 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vstelm_d(src0, dst, 0, 0); + dst += dst_stride; + __lsx_vstelm_d(src1, dst, 0, 0); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = (uint8_t *)src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + src0 = __lsx_vld(src_tmp, 0); + DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp, + src_stride3, src_tmp, src_stride4, src1, src2, src3, src4); + src_tmp += src_stride4; + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src_tmp += src_stride2; + src7 = __lsx_vldx(src_tmp, src_stride); + src_tmp += src_stride2; + + __lsx_vst(src0, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src1, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src2, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src3, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + src += 16; + dst += 16; + } +} + +static void copy_width16_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3, + src, src_stride4, src1, src2, src3, src4); + src += src_stride4; + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6); + src += src_stride2; + src7 = __lsx_vldx(src, src_stride); + src += src_stride2; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + __lsx_vst(src4, dst, 0); + dst += dst_stride; + __lsx_vst(src5, dst, 0); + dst += dst_stride; + __lsx_vst(src6, dst, 0); + dst += dst_stride; + __lsx_vst(src7, dst, 0); + dst += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + } + } +} + +static void copy_width32_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *src_tmp; + uint8_t *dst_tmp; + __m128i src0, src1, src2, src3, src4, src5, src6, src7; + int32_t src_stride2 = src_stride << 1; + int32_t src_stride3 = src_stride2 + src_stride; + int32_t src_stride4 = src_stride2 << 1; + + if ((height % 12) == 0) { + for (cnt = (height / 12); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } else if ((height % 8) == 0) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32); + } else if ((height % 4) == 0) { + for (cnt = (height >> 2); cnt--;) { + src0 = __lsx_vld(src, 0); + DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2); + src3 = __lsx_vldx(src, src_stride3); + + src_tmp = (uint8_t *)src + 16; + src4 = __lsx_vld(src_tmp, 0); + DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5, + src6); + src7 = __lsx_vldx(src_tmp, src_stride3); + src += src_stride4; + + __lsx_vst(src0, dst, 0); + dst += dst_stride; + __lsx_vst(src1, dst, 0); + dst += dst_stride; + __lsx_vst(src2, dst, 0); + dst += dst_stride; + __lsx_vst(src3, dst, 0); + dst += dst_stride; + + dst_tmp = dst + 16; + __lsx_vst(src4, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src5, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src6, dst_tmp, 0); + dst_tmp += dst_stride; + __lsx_vst(src7, dst_tmp, 0); + dst_tmp += dst_stride; + } + } +} + +static void copy_width64_lsx(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64); +} + +void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 4: { + uint32_t cnt; + __m128i tmp; + for (cnt = h; cnt--;) { + tmp = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tmp, dst, 0, 0); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_lsx(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_lsx(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h new file mode 100644 index 0000000000..d886b00198 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ +#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_util/loongson_intrinsics.h" + +static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1, + __m128i _reg2, __m128i _reg3, + __m128i _filter0, __m128i _filter1, + __m128i _filter2, __m128i _filter3) { + __m128i _vec0, _vec1; + + _vec0 = __lsx_vdp2_h_b(_reg0, _filter0); + _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1); + _vec1 = __lsx_vdp2_h_b(_reg2, _filter2); + _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3); + return __lsx_vsadd_h(_vec0, _vec1); +} + +static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1, + __m128i _mask0, __m128i _mask1, + __m128i _mask2, __m128i _mask3, + __m128i _filt_h0, __m128i _filt_h1, + __m128i _filt_h2, __m128i _filt_h3) { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + __m128i _out; + + DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1, + _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3); + _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1, + _filt_h2, _filt_h3); + _out = __lsx_vsrari_h(_out, FILTER_BITS); + return __lsx_vsat_h(_out, 7); +} + +static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask, + __m128i coeff) { + __m128i tmp0_m, tmp1_m; + + tmp0_m = __lsx_vshuf_b(in1, in0, mask); + tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff); + return __lsx_vsrari_h(tmp1_m, FILTER_BITS); +} + +#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \ + do { \ + _src0 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src1 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src2 = __lsx_vld(_src, 0); \ + _src += _stride; \ + _src3 = __lsx_vld(_src, 0); \ + } while (0) + +#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \ + _mask2, _mask3, _filter0, _filter1, \ + _filter2, _filter3, _out0, _out1) \ + do { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3; \ + \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0, \ + _tmp0, _tmp1); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1, \ + _tmp2, _tmp3); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3, \ + _filter1, _reg0, _reg1); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2, \ + _tmp4, _tmp5); \ + DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \ + DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3, \ + _tmp6, _tmp7); \ + DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7, \ + _filter3, _reg2, _reg3); \ + DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1); \ + } while (0) + +#define HORIZ_8TAP_8WID_4VECS_FILT( \ + _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0, \ + _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3) \ + do { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7; \ + \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, \ + _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2, \ + _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, \ + _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \ + _tmp3); \ + DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2, \ + _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, \ + _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5, \ + _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \ + _reg1, _reg2, _reg3); \ + DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, \ + _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \ + _tmp7); \ + DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5, \ + _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \ + _reg5, _reg6, _reg7); \ + DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3, \ + _reg7, _out0, _out1, _out2, _out3); \ + } while (0) + +#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride) \ + do { \ + __m128i tmp0_m, tmp1_m; \ + \ + DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp0_m, pdst, 0, 1); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 0); \ + pdst += stride; \ + __lsx_vstelm_d(tmp1_m, pdst, 0, 1); \ + } while (0) + +#endif // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/loopfilter.c b/media/libvpx/libvpx/vpx_dsp/loopfilter.c index 9866ea37d6..d6504aab1f 100644 --- a/media/libvpx/libvpx/vpx_dsp/loopfilter.c +++ b/media/libvpx/libvpx/vpx_dsp/loopfilter.c @@ -81,11 +81,11 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; - const int8_t ps1 = (int8_t)*op1 ^ 0x80; - const int8_t ps0 = (int8_t)*op0 ^ 0x80; - const int8_t qs0 = (int8_t)*oq0 ^ 0x80; - const int8_t qs1 = (int8_t)*oq1 ^ 0x80; - const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + const int8_t ps1 = (int8_t)(*op1 ^ 0x80); + const int8_t ps0 = (int8_t)(*op0 ^ 0x80); + const int8_t qs0 = (int8_t)(*oq0 ^ 0x80); + const int8_t qs1 = (int8_t)(*oq1 ^ 0x80); + const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance int8_t filter = signed_char_clamp(ps1 - qs1) & hev; @@ -99,39 +99,40 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; - *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; - *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); + *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); // outer tap adjustments filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; - *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; + *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); + *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); } -void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch); ++s; } } -void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -158,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch, + s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch); ++s; } } -void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -229,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8 * count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = - flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + const int8_t flat2 = flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]); - filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, - s + 7 * p); + filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, + s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch, + s + 7 * pitch); ++s; } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); - s += p; + s += pitch; } } -void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8); } -void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16); } #if CONFIG_VP9_HIGHBITDEPTH @@ -416,7 +423,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); - const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); // Add outer taps if we have high edge variance. int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; @@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); } -void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; @@ -448,27 +455,28 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, + s + 1 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_4_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -497,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, - s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_8_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -573,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, @@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8 * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat2 = - highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd); - highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, - s + 6 * p, s + 7 * p, bd); + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, + s + 6 * pitch, s + 7 * pitch, bd); ++s; } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd); } -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); - s += p; + s += pitch; } } -void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd); } -void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c index 43d2c1146e..97541411e4 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -9,7 +9,9 @@ */ #include -#include "./macros_msa.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, int blackclamp, int whiteclamp, int width, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c index 52a24ed379..3fd18dec56 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/avg_msa.c @@ -7,7 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -54,3 +56,676 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } + +#if !CONFIG_VP9_HIGHBITDEPTH +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); +} + +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src11, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); + + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); +} + +int vpx_satd_msa(const int16_t *data, int length) { + int i, satd; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 src8, src9, src10, src11, src12, src13, src14, src15; + v8i16 zero = { 0 }; + v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; + v4u32 tmp0_w = { 0 }; + + if (16 == length) { + LD_SH2(data, 8, src0, src1); + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + satd = HADD_UW_U32(tmp0_w); + } else if (64 == length) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + satd = HADD_UW_U32(tmp0_w); + } else if (256 == length) { + for (i = 0; i < 2; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else if (1024 == length) { + for (i = 0; i < 8; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else { + satd = 0; + + for (i = 0; i < length; ++i) { + satd += abs(data[i]); + } + } + + return satd; +} +#endif // !CONFIG_VP9_HIGHBITDEPTH + +void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int i; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 hbuf_r = { 0 }; + v8i16 hbuf_l = { 0 }; + v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; + v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; + + if (16 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 3); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (32 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 4); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (64 == height) { + for (i = 4; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 5); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else { + const int norm_factor = height >> 1; + int cnt; + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] = 0; + } + + for (i = 0; i < height; ++i) { + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] += ref[cnt]; + } + + ref += ref_stride; + } + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] /= norm_factor; + } + } +} + +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { + int16_t sum; + v16u8 ref0, ref1, ref2, ref3; + v8u16 ref0_h; + + if (16 == width) { + ref0 = LD_UB(ref); + ref0_h = __msa_hadd_u_h(ref0, ref0); + sum = HADD_UH_U32(ref0_h); + } else if (32 == width) { + LD_UB2(ref, 16, ref0, ref1); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + sum = HADD_UH_U32(ref0_h); + } else if (64 == width) { + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + ref0_h += __msa_hadd_u_h(ref2, ref2); + ref0_h += __msa_hadd_u_h(ref3, ref3); + sum = HADD_UH_U32(ref0_h); + } else { + int idx; + + sum = 0; + for (idx = 0; idx < width; ++idx) { + sum += ref[idx]; + } + } + + return sum; +} + +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { + int sse, mean, var; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; + v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; + v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; + v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; + v4i32 res_l7_m, mean_v; + v2i64 sse_v; + + if (2 == bwl) { + LD_SH2(src, 8, src0, src1); + LD_SH2(ref, 8, ref0, ref1); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (3 == bwl) { + LD_SH4(src, 8, src0, src1, src2, src3); + LD_SH4(ref, 8, ref0, ref1, ref2, ref3); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (4 == bwl) { + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); + ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); + ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); + ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v += res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else { + int i; + const int width = 4 << bwl; + + sse = 0; + mean = 0; + + for (i = 0; i < width; ++i) { + const int diff = ref[i] - src[i]; + + mean += diff; + sse += diff * diff; + } + } + + var = sse - ((mean * mean) >> (bwl + 2)); + + return var; +} + +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; + v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; + + LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); + PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); + PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); + + diff0 = __msa_asub_u_b(s0, d0); + diff1 = __msa_asub_u_b(s1, d1); + diff2 = __msa_asub_u_b(s2, d2); + diff3 = __msa_asub_u_b(s3, d3); + + min0 = __msa_min_u_b(diff0, diff1); + min1 = __msa_min_u_b(diff2, diff3); + min0 = __msa_min_u_b(min0, min1); + + max0 = __msa_max_u_b(diff0, diff1); + max1 = __msa_max_u_b(diff2, diff3); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); + max0 = __msa_max_u_b(max0, max1); + + *min = min0[0]; + *max = max0[0]; +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h index 0a42f5cec2..87a5bbab56 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_COMMON_MIPS_DSPR2_H_ -#define VPX_COMMON_MIPS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ #include #include "./vpx_config.h" @@ -45,4 +45,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) { } // extern "C" #endif -#endif // VPX_COMMON_MIPS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88eddfd6..18e7d5375d 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207b6e..7dcb662d7f 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5f26..9e65a8f50f 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e447..a3e967b405 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index b4ed6ee850..cc458c8618 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -334,15 +335,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + if (vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +369,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +378,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,24 +392,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; uint32_t tp1, tp2, tn1, tp3, tp4, tn2; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab25a5..7a9aa49d8a 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -938,15 +939,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + if (vpx_get_filter_taps(filter_x) == 2) { + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +989,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c index 8d35b6394e..1e7052f6c5 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1320,7 +1322,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, if (filter_x[3] == 0x80) { copy_horiz_transposed(src - src_stride * 3, src_stride, temp, intermediate_height, w, intermediate_height); - } else if (((const int32_t *)filter_x)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, intermediate_height, filter_x, w, intermediate_height); } else { @@ -1363,7 +1365,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, /* copy the src to dst */ if (filter_y[3] == 0x80) { copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); - } else if (((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, filter_y, h, w); } else { @@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2f0b..09d6f36e56 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + if (vpx_get_filter_taps(filter_x) == 2) { + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5c47..fd977b5336 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + if (vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd5e1..14b65bc650 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ -#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ #include @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,13 +46,13 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c index e33ea740a9..4e93ff594d 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/deblock_msa.c @@ -9,42 +9,43 @@ */ #include -#include "./macros_msa.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, \ - out15) \ - { \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ } #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ @@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t *p_dst_st = dst_ptr; uint8_t *f_orig = f; uint16_t col; + uint64_t out0, out1, out2, out3; v16u8 above2, above1, below2, below1; v16u8 src, ref, ref_temp; v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; @@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, f += 16; } + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = __msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + f = f_orig; p_dst = dst_ptr - 2; LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, @@ -446,11 +509,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, } } -void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, - int32_t rows, int32_t cols, int32_t flimit) { +void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { int32_t row, col, cnt; - uint8_t *src_dup = src_ptr; - v16u8 src0, src, tmp_orig; + uint8_t *src_dup = src; + v16u8 src0, src1, tmp_orig; v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; @@ -469,13 +532,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src_dup[cols + 16] = src_dup[cols - 1]; tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; - src = LD_UB(src_dup - 8); - src[15] = 0; - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup - 8); + src1[15] = 0; + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); sum_sq = HADD_SW_S32(src_r_w) + 16; - sum_h = __msa_hadd_u_h(src, src); + sum_h = __msa_hadd_u_h(src1, src1); sum = HADD_UH_U32(sum_h); { v16u8 src7, src8, src_r, src_l; @@ -504,8 +567,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; } sum = sum_l[7]; - src = LD_UB(src_dup + 16 * col); - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); @@ -551,7 +614,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8)mask); + tmp = __msa_bmz_v(tmp, src1, (v16u8)mask); if (col == 0) { uint64_t src_d; diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c index e41a904808..36583e2d24 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" static void fdct8x32_1d_column_load_butterfly(const int16_t *input, @@ -927,21 +928,21 @@ void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out, } void vpx_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16, stride); - sum += LD_HADD(input + 24, stride); - sum += LD_HADD(input + 32 * 8, stride); - sum += LD_HADD(input + 32 * 8 + 8, stride); - sum += LD_HADD(input + 32 * 8 + 16, stride); - sum += LD_HADD(input + 32 * 8 + 24, stride); - sum += LD_HADD(input + 32 * 16, stride); - sum += LD_HADD(input + 32 * 16 + 8, stride); - sum += LD_HADD(input + 32 * 16 + 16, stride); - sum += LD_HADD(input + 32 * 16 + 24, stride); - sum += LD_HADD(input + 32 * 24, stride); - sum += LD_HADD(input + 32 * 24 + 8, stride); - sum += LD_HADD(input + 32 * 24 + 16, stride); - sum += LD_HADD(input + 32 * 24 + 24, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 16; ++i) { + LD_SH4(input, 8, in0, in1, in2, in3); + input += stride; + LD_SH4(input, 8, in4, in5, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 3); } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c index fdead50503..5a6dfcef2f 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -8,8 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -215,11 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } -void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -237,9 +247,26 @@ void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, } void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16 * 8, stride); - sum += LD_HADD(input + 16 * 8 + 8, stride); + int sum, i; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w = { 0 }; + + for (i = 0; i < 4; ++i) { + LD_SH2(input, 8, in0, in1); + input += stride; + LD_SH2(input, 8, in2, in3); + input += stride; + LD_SH2(input, 8, in4, in5); + input += stride; + LD_SH2(input, 8, in6, in7); + input += stride; + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w += __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + } + + sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h index db5e90e7b9..c0be56b819 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -8,28 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_ -#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" -#define LD_HADD(psrc, stride) \ - ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ - in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ - }) - #define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ @@ -377,4 +361,4 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); void fdct16x8_1d_row(int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c index 2a211c5677..7ca61a28ec 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct16x16_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c index 2ea6136f9b..053948183a 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" static void idct32x8_row_transpose_store(const int16_t *input, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c index 0a85742f10..56ffec3cba 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct4x4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c index 7f77d20191..a383ff2066 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/idct8x8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index 27881f0db6..cbea22f20f 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ -#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ #include @@ -25,7 +25,6 @@ extern "C" { #if HAVE_DSPR2 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ ({ \ - \ int32_t tmp, out; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \ int in = input; \ @@ -73,4 +72,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output); } // extern "C" #endif -#endif // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 1fe9b28e8a..3b66249ef2 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_ -#define VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -408,4 +408,4 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output); void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, int32_t dst_stride); void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 3f985b847b..e214b538d4 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -343,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) { return; } + // 32-bit result is enough for the following multiplications. s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index b73d56bd55..b1731f2345 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -8,13 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, } } -void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -413,11 +415,11 @@ static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, (void)count; - early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); if (0 == early_exit) { - vpx_hz_lpf_t16_16w(src, pitch, filter48); + hz_lpf_t16_16w(src, pitch, filter48); } } @@ -753,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); } -int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -820,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16i8 zero = { 0 }; v16u8 filter8, flat, flat2; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1051,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); early_exit = - vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); @@ -1064,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, } } -int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -1141,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1473,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); early_exit = - vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_16x16(transposed_input, 16, (src - 8), pitch); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index 9500cd2fd8..0eff2b6ca9 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index a22c62bb3a..703fcce8a7 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h index 5b0c73345b..ec339be868 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ #include @@ -731,4 +731,4 @@ static INLINE void wide_mbfilter_dspr2( } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h index 38ed0b2a63..9af0b42360 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ #include @@ -432,4 +432,4 @@ extern "C" { } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h index ee11142266..24c492bea0 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ #include @@ -352,4 +352,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h index 49fd74c25a..1ea05e0b0b 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_LOOPFILTER_MSA_H_ -#define VPX_DSP_LOOPFILTER_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -174,4 +174,4 @@ mask_out = limit_in < (v16u8)mask_out; \ mask_out = __msa_xori_b(mask_out, 0xff); \ } -#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h index 002e574aa8..da79b40a7a 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/macros_msa.h @@ -8,215 +8,159 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_MACROS_MSA_H_ -#define VPX_DSP_MIPS_MACROS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ #include #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); #else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] "m"(*psrc_lh_m)); \ + \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__( \ + "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \ + "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ + : [val_lw_m] "=&r"(val_lw_m) \ + : [psrc_lw_m] "r"(psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__( \ + "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \ + "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ + : [val_ld_m] "=&r"(val_ld_m) \ + : [psrc_ld_m] "r"(psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m_combined = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m_combined = (uint64_t)(val1_m); \ - val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ - val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ - \ - val_m_combined; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -283,97 +227,73 @@ SD(in3, (pdst) + 3 * stride); \ } -/* Description : Load vectors with 16 byte elements with stride +/* Description : Load vector elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ - } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7, out8, out9, out10, out11, out12, out13, out14, out15) \ { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ out13, out14, out15); \ } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) @@ -388,79 +308,35 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride +/* Description : Store vectors with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ - } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -605,17 +481,17 @@ with rounding is calculated and written to 'out0' */ #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - { \ + do { \ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ - } + } while (0) #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) #define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3) \ { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1); \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3); \ } #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) @@ -681,6 +557,7 @@ #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ out3) \ @@ -897,16 +774,16 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ }) /* Description : Horizontal addition of 4 unsigned word elements @@ -916,16 +793,16 @@ Details : 4 unsigned word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UW_U32(in) \ - ({ \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m += res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ }) /* Description : Horizontal addition of 8 unsigned halfword elements @@ -935,14 +812,14 @@ Details : 8 unsigned halfword elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ }) /* Description : Horizontal addition of unsigned byte vector elements @@ -1049,6 +926,7 @@ } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) +#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1307,6 +1185,7 @@ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) @@ -1559,6 +1438,12 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. */ +#define SRA_2V(in0, in1, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + } + #define SRA_4V(in0, in1, in2, in3, shift) \ { \ in0 = in0 >> shift; \ @@ -1578,15 +1463,15 @@ 'shift' is a vector. */ #define SRAR_W2(RTYPE, in0, in1, shift) \ - { \ + do { \ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ - } + } while (0) #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ + SRAR_W2(RTYPE, in0, in1, shift); \ + SRAR_W2(RTYPE, in2, in3, shift); \ } #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) @@ -1714,6 +1599,25 @@ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) @@ -1872,8 +1776,6 @@ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ \ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ @@ -2027,19 +1929,17 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } /* Description : Pack even byte elements and store byte vector in destination @@ -2068,4 +1968,4 @@ \ tmp1_m; \ }) -#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 0000000000..7f5882bca3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,807 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + mips_reg l_second_pred = (mips_reg)second_pred; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"(l_second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + /* clang-format on */ + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c index e295123acf..b0f8ff1fd9 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/sad_msa.c @@ -159,640 +159,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, return sad; } -static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - INSERT_W4_UB(src0, src1, src2, src3, src); - - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref, ref0, ref1, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - -static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, diff; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref0, ref1, ref; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1297,108 +663,38 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ } -#define VPX_SAD_4xHEIGHTx3_MSA(height) \ - void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx3_MSA(height) \ - void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx3_MSA(height) \ - void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ - void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ - void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_4xHEIGHTx8_MSA(height) \ - void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_8xHEIGHTx8_MSA(height) \ - void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_16xHEIGHTx8_MSA(height) \ - void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ - void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ - void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx4D_MSA(height) \ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx4D_MSA(height) \ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_32xHEIGHTx4D_MSA(height) \ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_64xHEIGHTx4D_MSA(height) \ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } @@ -1444,91 +740,65 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, // 64x64 VPX_SAD_64xHEIGHT_MSA(64); -VPX_SAD_64xHEIGHTx3_MSA(64); -VPX_SAD_64xHEIGHTx8_MSA(64); VPX_SAD_64xHEIGHTx4D_MSA(64); VPX_AVGSAD_64xHEIGHT_MSA(64); // 64x32 VPX_SAD_64xHEIGHT_MSA(32); -VPX_SAD_64xHEIGHTx3_MSA(32); -VPX_SAD_64xHEIGHTx8_MSA(32); VPX_SAD_64xHEIGHTx4D_MSA(32); VPX_AVGSAD_64xHEIGHT_MSA(32); // 32x64 VPX_SAD_32xHEIGHT_MSA(64); -VPX_SAD_32xHEIGHTx3_MSA(64); -VPX_SAD_32xHEIGHTx8_MSA(64); VPX_SAD_32xHEIGHTx4D_MSA(64); VPX_AVGSAD_32xHEIGHT_MSA(64); // 32x32 VPX_SAD_32xHEIGHT_MSA(32); -VPX_SAD_32xHEIGHTx3_MSA(32); -VPX_SAD_32xHEIGHTx8_MSA(32); VPX_SAD_32xHEIGHTx4D_MSA(32); VPX_AVGSAD_32xHEIGHT_MSA(32); // 32x16 VPX_SAD_32xHEIGHT_MSA(16); -VPX_SAD_32xHEIGHTx3_MSA(16); -VPX_SAD_32xHEIGHTx8_MSA(16); VPX_SAD_32xHEIGHTx4D_MSA(16); VPX_AVGSAD_32xHEIGHT_MSA(16); // 16x32 VPX_SAD_16xHEIGHT_MSA(32); -VPX_SAD_16xHEIGHTx3_MSA(32); -VPX_SAD_16xHEIGHTx8_MSA(32); VPX_SAD_16xHEIGHTx4D_MSA(32); VPX_AVGSAD_16xHEIGHT_MSA(32); // 16x16 VPX_SAD_16xHEIGHT_MSA(16); -VPX_SAD_16xHEIGHTx3_MSA(16); -VPX_SAD_16xHEIGHTx8_MSA(16); VPX_SAD_16xHEIGHTx4D_MSA(16); VPX_AVGSAD_16xHEIGHT_MSA(16); // 16x8 VPX_SAD_16xHEIGHT_MSA(8); -VPX_SAD_16xHEIGHTx3_MSA(8); -VPX_SAD_16xHEIGHTx8_MSA(8); VPX_SAD_16xHEIGHTx4D_MSA(8); VPX_AVGSAD_16xHEIGHT_MSA(8); // 8x16 VPX_SAD_8xHEIGHT_MSA(16); -VPX_SAD_8xHEIGHTx3_MSA(16); -VPX_SAD_8xHEIGHTx8_MSA(16); VPX_SAD_8xHEIGHTx4D_MSA(16); VPX_AVGSAD_8xHEIGHT_MSA(16); // 8x8 VPX_SAD_8xHEIGHT_MSA(8); -VPX_SAD_8xHEIGHTx3_MSA(8); -VPX_SAD_8xHEIGHTx8_MSA(8); VPX_SAD_8xHEIGHTx4D_MSA(8); VPX_AVGSAD_8xHEIGHT_MSA(8); // 8x4 VPX_SAD_8xHEIGHT_MSA(4); -VPX_SAD_8xHEIGHTx3_MSA(4); -VPX_SAD_8xHEIGHTx8_MSA(4); VPX_SAD_8xHEIGHTx4D_MSA(4); VPX_AVGSAD_8xHEIGHT_MSA(4); // 4x8 VPX_SAD_4xHEIGHT_MSA(8); -VPX_SAD_4xHEIGHTx3_MSA(8); -VPX_SAD_4xHEIGHTx8_MSA(8); VPX_SAD_4xHEIGHTx4D_MSA(8); VPX_AVGSAD_4xHEIGHT_MSA(8); // 4x4 VPX_SAD_4xHEIGHT_MSA(4); -VPX_SAD_4xHEIGHTx3_MSA(4); -VPX_SAD_4xHEIGHTx8_MSA(4); VPX_SAD_4xHEIGHTx4D_MSA(4); VPX_AVGSAD_4xHEIGHT_MSA(4); diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c index 313e06f92d..572fcabfc0 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -27,13 +27,14 @@ static const uint8_t bilinear_filters_msa[8][2] = { HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ \ - sub += res_l0_m + res_l1_m; \ + (sub) += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -1619,16 +1620,16 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ } else { \ @@ -1638,7 +1639,7 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( \ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ \ @@ -1672,15 +1673,15 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1690,7 +1691,7 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ @@ -1719,16 +1720,16 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, int32_t yoffset, + int32_t x_offset, int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; - if (yoffset) { - if (xoffset) { + if (y_offset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_hv_msa( src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, v_filter, 64, &diff); @@ -1738,7 +1739,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, v_filter, 64, &diff); } } else { - if (xoffset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, 64, &diff); @@ -1753,15 +1754,15 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1771,7 +1772,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 0000000000..8bd7e6977c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : [src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c new file mode 100644 index 0000000000..d4563dc410 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/sum_squares_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "./macros_msa.h" + +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride, + int size) { + int row, col; + uint64_t ss_res = 0; + v4i32 mul0, mul1; + v2i64 res0 = { 0 }; + + if (4 == size) { + uint64_t src0, src1, src2, src3; + v8i16 diff0 = { 0 }; + v8i16 diff1 = { 0 }; + + LD4(src, src_stride, src0, src1, src2, src3); + INSERT_D2_SH(src0, src1, diff0); + INSERT_D2_SH(src2, src3, diff1); + DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (8 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (16 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (0 == (size % 16)) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + for (row = 0; row < (size >> 4); row++) { + for (col = 0; col < size; col += 16) { + const int16_t *src_ptr = src + col; + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + src_ptr += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + } + + src += 16 * src_stride; + } + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else { + int16_t val; + + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + val = src[col]; + ss_res += val * val; + } + + src += src_stride; + } + } + + return ss_res; +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h index f077fa4814..f27504a207 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ -#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -98,4 +98,4 @@ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ } -#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 0000000000..6428bc7006 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1357 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. */ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "pand %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "pand %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "pand %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "pand %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "pand %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "pand %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "pand %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "pand %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), + [sum]"=&r"(sum) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "ssrld %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), + [sum]"=&r"(sum) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "pxor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pxor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "pxor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "ssrld %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + /* clang-format on */ + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + /* clang-format off */ + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pxor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) + "bnez %[tmp0], 1b \n\t" + + "ssrld %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + /* clang-format on */ + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + double ff_ph_40, mask; + double filter_x0, filter_x1, filter_y0, filter_y1; + mips_reg tmp[2]; + uint64_t x0, x1, y0, y1, all; + + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[16 * (H)]; \ + var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H) - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp14]) + "punpcklbh %[ftmp14], %[ftmp14], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp14], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp14], %[ftmp0] \n\t" + "ssrld %[ftmp14], %[ftmp14], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp14], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter), [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[8 * (H)]; \ + var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H) - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + double ff_ph_40, mask; + uint64_t x0, x1, y0, y1, all; + double filter_x0, filter_x1, filter_y0, filter_y1; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; + x0 = (uint64_t)filter_x[0]; + x1 = (uint64_t)filter_x[1]; + y0 = (uint64_t)filter_y[0]; + y1 = (uint64_t)filter_y[1]; + all = x0 | x1 << 8 | y0 << 16 | y1 << 24; + + /* clang-format off */ + __asm__ volatile ( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_MTC1(%[all], %[ftmp6]) + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "pshufh %[filter_x0], %[ftmp6], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x10) + MMI_MTC1(%[tmp0], %[mask]) + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_x1], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y0], %[ftmp6], %[ftmp0] \n\t" + "ssrld %[ftmp6], %[ftmp6], %[mask] \n\t" + "pshufh %[filter_y1], %[ftmp6], %[ftmp0] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + MMI_LI(%[tmp0], 0x0040004000400040) + MMI_MTC1(%[tmp0], %[ff_ph_40]) + MMI_LI(%[tmp0], 0x00ff00ff00ff00ff) + MMI_MTC1(%[tmp0], %[mask]) + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter), + [ff_ph_40] "=&f"(ff_ph_40), [mask] "=&f"(mask), + [filter_x0] "=&f"(filter_x0), [filter_x1] "=&f"(filter_x1), + [filter_y0] "=&f"(filter_y0), [filter_y1] "=&f"(filter_y1) + : [src_stride] "r"((mips_reg)src_stride), [all] "r"(all) + : "memory" + ); + /* clang-format on */ +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[4 * (H)]; \ + var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H) - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c index 49b2f99230..444b086a6e 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/variance_msa.c @@ -33,10 +33,11 @@ sub += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af28669..5b5a1cbc3a 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 dst0 = { 0 }, res; v16u8 mask0, mask1, mask2, mask3; v8i16 filt, res0, res1; @@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, vec0, vec1, vec2, vec3; mask0 = LD_UB(&mc_filt_mask_arr[16]); @@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, LD_SB4(src, src_stride, src0, src1, src2, src3); XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, vec0, vec1); LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, res3); ILVR_D2_UB(res1, res0, res3, res2, res0, res2); XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); ST4x8_UB(res0, res2, dst, dst_stride); } @@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, out0, out1, out2, out3; mask0 = LD_UB(&mc_filt_mask_arr[0]); @@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } @@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; v8u16 vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 vec4, vec5, vec6, vec7, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, @@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } static void common_hz_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } } @@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -645,7 +658,7 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, @@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa63201c..ba816192a1 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; @@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); @@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out5 = hz_out9; @@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; @@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); @@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa( static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; + v16u8 dst0 = { 0 }, out; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( hz_out3, hz_out5, 8); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4w_msa( @@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa( static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); @@ -357,16 +362,16 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( tmp3 = __msa_dotp_u_h(vec0, filt_vt); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -531,8 +538,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, @@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b2f5..e6a790dfc6 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; + v16u8 dst0 = { 0 }, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; v16i8 src10998, filt0, filt1, filt2, filt3; @@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); @@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; @@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); @@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; v16i8 src10_r, src32_r, src21_r, src43_r; v8i16 filt; v8u16 tmp0, tmp1; @@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, src4 = LD_SB(src); src += src_stride; - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); @@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; v16u8 src2110, src4332, src6554, src8776, filt0; @@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src += (8 * src_stride); src8 = LD_SB(src); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, @@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); + ST4x8_UB(src2110, src4332, dst, dst_stride); } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h(filt, 0); LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_vt_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( for (loop_cnt = (height >> 3); loop_cnt--;) { LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, vec3); @@ -378,15 +387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -617,7 +625,7 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, @@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b519..792c0f709c 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -633,7 +634,7 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c new file mode 100644 index 0000000000..cb7bca5589 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx_ports/mem.h" + +#define GET_DATA_H_MMI \ + "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ + "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ + "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ + "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \ + "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \ + "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ + "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ + "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ + "pmaddhw %[srch], %[srch], %[filter10] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" \ + "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ + "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ + "paddw %[srch], %[srch], %[ftmp12] \n\t" + +/* clang-format off */ +#define ROUND_POWER_OF_TWO_MMI \ + /* Add para[0] */ \ + "lw %[tmp0], 0x00(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp6]) \ + "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \ + "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \ + "paddw %[srch], %[srch], %[ftmp6] \n\t" \ + /* Arithmetic right shift para[1] bits */ \ + "lw %[tmp0], 0x04(%[para]) \n\t" \ + MMI_MTC1(%[tmp0], %[ftmp5]) \ + "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \ + "psraw %[srch], %[srch], %[ftmp5] \n\t" +/* clang-format on */ + +#define CLIP_PIXEL_MMI \ + /* Staturated operation */ \ + "packsswh %[srcl], %[srcl], %[srch] \n\t" \ + "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t" + +static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[5]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + /* clang-format off */ + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w & 0x03) { + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + 64, filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } else { + convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } +} + +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); + else + convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); + else + convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + else + convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index b16ec57886..c942167587 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -556,8 +558,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } } + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 8 here. + y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 410682271f..195228689e 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,7 +641,7 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c index 45399bad85..ce649935da 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, @@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c index c3d87a4ab8..c2ab33a2f4 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -9,6 +9,7 @@ */ #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, @@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index f75679521a..a0280c5434 100644 --- a/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/media/libvpx/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ -#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/vpx_filter.h" @@ -110,14 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; ST_UB(tmp_m, (pdst)); \ } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } -#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/postproc.h b/media/libvpx/libvpx/vpx_dsp/postproc.h index 43cb5c8e8d..37f993f814 100644 --- a/media/libvpx/libvpx/vpx_dsp/postproc.h +++ b/media/libvpx/libvpx/vpx_dsp/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_POSTPROC_H_ -#define VPX_DSP_POSTPROC_H_ +#ifndef VPX_VPX_DSP_POSTPROC_H_ +#define VPX_VPX_DSP_POSTPROC_H_ #ifdef __cplusplus extern "C" { @@ -22,4 +22,4 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size); } #endif -#endif // VPX_DSP_POSTPROC_H_ +#endif // VPX_VPX_DSP_POSTPROC_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h new file mode 100644 index 0000000000..7ac873f9fc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + int32x4_t u = vec_vsx_ld(c, s); + int32x4_t v = vec_vsx_ld(c, s + 4); + return vec_packs(u, v); +#else + return vec_vsx_ld(c, s); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { +#if CONFIG_VP9_HIGHBITDEPTH + const int16x8_t one = vec_splat_s16(1); + const int32x4_t even = vec_mule(v, one); + const int32x4_t odd = vec_mulo(v, one); + const int32x4_t high = vec_mergeh(even, odd); + const int32x4_t low = vec_mergel(even, odd); + vec_vsx_st(high, c, s); + vec_vsx_st(low, c, s + 4); +#else + vec_vsx_st(v, c, s); +#endif +} + +#endif // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 0000000000..2129911696 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. +static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. + for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. + d15_odd = vec_mulo(window[15], window[15]); + d15_even = vec_mule(window[15], window[15]); + d0_odd = vec_mulo(window[0], window[0]); + d0_even = vec_mule(window[0], window[0]); + sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd)); + sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even)); + + // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4 + filtered = filter_s16(rv, sum, window[8]); + + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(window[8], filtered, mask); + + // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per + // iteration + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch), + load_merge); + vec_vsx_st(out, 0, dst + row * pitch); + + // Optimization Note: Turns out that the following loop is faster than + // using pointers to manage the sliding window. + for (i = 1; i < 16; i++) { + window[i - 1] = window[i]; + } + } + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c new file mode 100644 index 0000000000..328b0e3130 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/txfm_common_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14. +static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add, + int16x8_t *sub) { + // Since a + b can overflow 16 bits, the multiplication is distributed + // (a * c +/- b * c). + const int32x4_t ac_e = vec_mule(a, cospi16_v); + const int32x4_t ac_o = vec_mulo(a, cospi16_v); + const int32x4_t bc_e = vec_mule(b, cospi16_v); + const int32x4_t bc_o = vec_mulo(b, cospi16_v); + + // Reuse the same multiplies for sum and difference. + const int32x4_t sum_e = vec_add(ac_e, bc_e); + const int32x4_t sum_o = vec_add(ac_o, bc_o); + const int32x4_t diff_e = vec_sub(ac_e, bc_e); + const int32x4_t diff_o = vec_sub(ac_o, bc_o); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute. + *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14 +static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b, + int16x8_t c2, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t ac1_o = vec_mulo(a, c1); + const int32x4_t ac1_e = vec_mule(a, c1); + const int32x4_t ac2_o = vec_mulo(a, c2); + const int32x4_t ac2_e = vec_mule(a, c2); + + const int32x4_t bc1_o = vec_mulo(b, c1); + const int32x4_t bc1_e = vec_mule(b, c1); + const int32x4_t bc2_o = vec_mulo(b, c2); + const int32x4_t bc2_e = vec_mule(b, c2); + + const int32x4_t sum_o = vec_add(ac1_o, bc2_o); + const int32x4_t sum_e = vec_add(ac1_e, bc2_e); + const int32x4_t diff_o = vec_sub(ac2_o, bc1_o); + const int32x4_t diff_e = vec_sub(ac2_e, bc1_e); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute. + *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// While other architecture combine the load and the stage 1 operations, Power9 +// benchmarking show no benefit in such an approach. +static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + // Tried out different combinations of load and shift instructions, this is + // the fastest one. + { + const int16x8_t l0 = vec_vsx_ld(0, a); + const int16x8_t l1 = vec_vsx_ld(0, a + stride); + const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride); + const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride); + const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride); + const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride); + const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride); + const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride); + + const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride); + const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride); + const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride); + const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride); + const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride); + const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride); + const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride); + const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride); + + b[0] = vec_sl(l0, vec_dct_scale_log2); + b[1] = vec_sl(l1, vec_dct_scale_log2); + b[2] = vec_sl(l2, vec_dct_scale_log2); + b[3] = vec_sl(l3, vec_dct_scale_log2); + b[4] = vec_sl(l4, vec_dct_scale_log2); + b[5] = vec_sl(l5, vec_dct_scale_log2); + b[6] = vec_sl(l6, vec_dct_scale_log2); + b[7] = vec_sl(l7, vec_dct_scale_log2); + + b[8] = vec_sl(l8, vec_dct_scale_log2); + b[9] = vec_sl(l9, vec_dct_scale_log2); + b[10] = vec_sl(l10, vec_dct_scale_log2); + b[11] = vec_sl(l11, vec_dct_scale_log2); + b[12] = vec_sl(l12, vec_dct_scale_log2); + b[13] = vec_sl(l13, vec_dct_scale_log2); + b[14] = vec_sl(l14, vec_dct_scale_log2); + b[15] = vec_sl(l15, vec_dct_scale_log2); + } + { + const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride); + const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride); + const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride); + const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride); + const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride); + const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride); + const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride); + const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride); + + const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride); + const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride); + const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride); + const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride); + const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride); + const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride); + const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride); + const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride); + + b[16] = vec_sl(l16, vec_dct_scale_log2); + b[17] = vec_sl(l17, vec_dct_scale_log2); + b[18] = vec_sl(l18, vec_dct_scale_log2); + b[19] = vec_sl(l19, vec_dct_scale_log2); + b[20] = vec_sl(l20, vec_dct_scale_log2); + b[21] = vec_sl(l21, vec_dct_scale_log2); + b[22] = vec_sl(l22, vec_dct_scale_log2); + b[23] = vec_sl(l23, vec_dct_scale_log2); + + b[24] = vec_sl(l24, vec_dct_scale_log2); + b[25] = vec_sl(l25, vec_dct_scale_log2); + b[26] = vec_sl(l26, vec_dct_scale_log2); + b[27] = vec_sl(l27, vec_dct_scale_log2); + b[28] = vec_sl(l28, vec_dct_scale_log2); + b[29] = vec_sl(l29, vec_dct_scale_log2); + b[30] = vec_sl(l30, vec_dct_scale_log2); + b[31] = vec_sl(l31, vec_dct_scale_log2); + } +} + +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + vec_vsx_st(b[0], 0, a); + vec_vsx_st(b[8], 0, a + 8); + vec_vsx_st(b[16], 0, a + 16); + vec_vsx_st(b[24], 0, a + 24); + + vec_vsx_st(b[1], 0, a + 32); + vec_vsx_st(b[9], 0, a + 40); + vec_vsx_st(b[17], 0, a + 48); + vec_vsx_st(b[25], 0, a + 56); + + vec_vsx_st(b[2], 0, a + 64); + vec_vsx_st(b[10], 0, a + 72); + vec_vsx_st(b[18], 0, a + 80); + vec_vsx_st(b[26], 0, a + 88); + + vec_vsx_st(b[3], 0, a + 96); + vec_vsx_st(b[11], 0, a + 104); + vec_vsx_st(b[19], 0, a + 112); + vec_vsx_st(b[27], 0, a + 120); + + vec_vsx_st(b[4], 0, a + 128); + vec_vsx_st(b[12], 0, a + 136); + vec_vsx_st(b[20], 0, a + 144); + vec_vsx_st(b[28], 0, a + 152); + + vec_vsx_st(b[5], 0, a + 160); + vec_vsx_st(b[13], 0, a + 168); + vec_vsx_st(b[21], 0, a + 176); + vec_vsx_st(b[29], 0, a + 184); + + vec_vsx_st(b[6], 0, a + 192); + vec_vsx_st(b[14], 0, a + 200); + vec_vsx_st(b[22], 0, a + 208); + vec_vsx_st(b[30], 0, a + 216); + + vec_vsx_st(b[7], 0, a + 224); + vec_vsx_st(b[15], 0, a + 232); + vec_vsx_st(b[23], 0, a + 240); + vec_vsx_st(b[31], 0, a + 248); +} + +// Returns 1 if negative 0 if positive +static INLINE int16x8_t vec_sign_s16(int16x8_t a) { + return vec_sr(a, vec_shift_sign_s16); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2); +} + +static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) { + int16x8_t temp0[32]; // Hold stages: 1, 4, 7 + int16x8_t temp1[32]; // Hold stages: 2, 5 + int16x8_t temp2[32]; // Hold stages: 3, 6 + int i; + + // Stage 1 + // Unrolling this loops actually slows down Power9 benchmarks + for (i = 0; i < 16; i++) { + temp0[i] = vec_add(in[i], in[31 - i]); + // pass through to stage 3. + temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]); + } + + // Stage 2 + // Unrolling this loops actually slows down Power9 benchmarks + for (i = 0; i < 8; i++) { + temp1[i] = vec_add(temp0[i], temp0[15 - i]); + temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]); + } + + // Apply butterflies (in place) on pass through to stage 3. + single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]); + single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]); + single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]); + single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]); + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. + if (pass) { + temp1[0] = add_round_shift_s16(temp1[0]); + temp1[1] = add_round_shift_s16(temp1[1]); + temp1[2] = add_round_shift_s16(temp1[2]); + temp1[3] = add_round_shift_s16(temp1[3]); + temp1[4] = add_round_shift_s16(temp1[4]); + temp1[5] = add_round_shift_s16(temp1[5]); + temp1[6] = add_round_shift_s16(temp1[6]); + temp1[7] = add_round_shift_s16(temp1[7]); + temp1[8] = add_round_shift_s16(temp1[8]); + temp1[9] = add_round_shift_s16(temp1[9]); + temp1[10] = add_round_shift_s16(temp1[10]); + temp1[11] = add_round_shift_s16(temp1[11]); + temp1[12] = add_round_shift_s16(temp1[12]); + temp1[13] = add_round_shift_s16(temp1[13]); + temp1[14] = add_round_shift_s16(temp1[14]); + temp1[15] = add_round_shift_s16(temp1[15]); + + temp1[16] = add_round_shift_s16(temp1[16]); + temp1[17] = add_round_shift_s16(temp1[17]); + temp1[18] = add_round_shift_s16(temp1[18]); + temp1[19] = add_round_shift_s16(temp1[19]); + temp1[20] = add_round_shift_s16(temp1[20]); + temp1[21] = add_round_shift_s16(temp1[21]); + temp1[22] = add_round_shift_s16(temp1[22]); + temp1[23] = add_round_shift_s16(temp1[23]); + temp1[24] = add_round_shift_s16(temp1[24]); + temp1[25] = add_round_shift_s16(temp1[25]); + temp1[26] = add_round_shift_s16(temp1[26]); + temp1[27] = add_round_shift_s16(temp1[27]); + temp1[28] = add_round_shift_s16(temp1[28]); + temp1[29] = add_round_shift_s16(temp1[29]); + temp1[30] = add_round_shift_s16(temp1[30]); + temp1[31] = add_round_shift_s16(temp1[31]); + } + + // Stage 3 + temp2[0] = vec_add(temp1[0], temp1[7]); + temp2[1] = vec_add(temp1[1], temp1[6]); + temp2[2] = vec_add(temp1[2], temp1[5]); + temp2[3] = vec_add(temp1[3], temp1[4]); + temp2[5] = vec_sub(temp1[2], temp1[5]); + temp2[6] = vec_sub(temp1[1], temp1[6]); + temp2[8] = temp1[8]; + temp2[9] = temp1[9]; + + single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]); + single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]); + temp2[14] = temp1[14]; + temp2[15] = temp1[15]; + + temp2[18] = vec_add(temp1[18], temp1[21]); + temp2[19] = vec_add(temp1[19], temp1[20]); + + temp2[20] = vec_sub(temp1[19], temp1[20]); + temp2[21] = vec_sub(temp1[18], temp1[21]); + + temp2[26] = vec_sub(temp1[29], temp1[26]); + temp2[27] = vec_sub(temp1[28], temp1[27]); + + temp2[28] = vec_add(temp1[28], temp1[27]); + temp2[29] = vec_add(temp1[29], temp1[26]); + + // Pass through Stage 4 + temp0[7] = vec_sub(temp1[0], temp1[7]); + temp0[4] = vec_sub(temp1[3], temp1[4]); + temp0[16] = vec_add(temp1[16], temp1[23]); + temp0[17] = vec_add(temp1[17], temp1[22]); + temp0[22] = vec_sub(temp1[17], temp1[22]); + temp0[23] = vec_sub(temp1[16], temp1[23]); + temp0[24] = vec_sub(temp1[31], temp1[24]); + temp0[25] = vec_sub(temp1[30], temp1[25]); + temp0[30] = vec_add(temp1[30], temp1[25]); + temp0[31] = vec_add(temp1[31], temp1[24]); + + // Stage 4 + temp0[0] = vec_add(temp2[0], temp2[3]); + temp0[1] = vec_add(temp2[1], temp2[2]); + temp0[2] = vec_sub(temp2[1], temp2[2]); + temp0[3] = vec_sub(temp2[0], temp2[3]); + single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]); + + temp0[9] = vec_add(temp2[9], temp2[10]); + temp0[10] = vec_sub(temp2[9], temp2[10]); + temp0[13] = vec_sub(temp2[14], temp2[13]); + temp0[14] = vec_add(temp2[14], temp2[13]); + + double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29], + &temp0[18]); + double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28], + &temp0[19]); + double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27], + &temp0[20]); + double_butterfly(temp2[26], cospi24_v, temp2[21], cospi8m_v, &temp0[26], + &temp0[21]); + + // Pass through Stage 5 + temp1[8] = vec_add(temp2[8], temp2[11]); + temp1[11] = vec_sub(temp2[8], temp2[11]); + temp1[12] = vec_sub(temp2[15], temp2[12]); + temp1[15] = vec_add(temp2[15], temp2[12]); + + // Stage 5 + // 0 and 1 pass through to 0 and 16 at the end + single_butterfly(temp0[0], temp0[1], &out[0], &out[16]); + + // 2 and 3 pass through to 8 and 24 at the end + double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]); + + temp1[4] = vec_add(temp0[4], temp0[5]); + temp1[5] = vec_sub(temp0[4], temp0[5]); + temp1[6] = vec_sub(temp0[7], temp0[6]); + temp1[7] = vec_add(temp0[7], temp0[6]); + + double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14], + &temp1[9]); + double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13], + &temp1[10]); + + temp1[17] = vec_add(temp0[17], temp0[18]); + temp1[18] = vec_sub(temp0[17], temp0[18]); + + temp1[21] = vec_sub(temp0[22], temp0[21]); + temp1[22] = vec_add(temp0[22], temp0[21]); + + temp1[25] = vec_add(temp0[25], temp0[26]); + temp1[26] = vec_sub(temp0[25], temp0[26]); + + temp1[29] = vec_sub(temp0[30], temp0[29]); + temp1[30] = vec_add(temp0[30], temp0[29]); + + // Pass through Stage 6 + temp2[16] = vec_add(temp0[16], temp0[19]); + temp2[19] = vec_sub(temp0[16], temp0[19]); + temp2[20] = vec_sub(temp0[23], temp0[20]); + temp2[23] = vec_add(temp0[23], temp0[20]); + temp2[24] = vec_add(temp0[24], temp0[27]); + temp2[27] = vec_sub(temp0[24], temp0[27]); + temp2[28] = vec_sub(temp0[31], temp0[28]); + temp2[31] = vec_add(temp0[31], temp0[28]); + + // Stage 6 + // 4 and 7 pass through to 4 and 28 at the end + double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]); + // 5 and 6 pass through to 20 and 12 at the end + double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20], + &out[12]); + temp2[8] = vec_add(temp1[8], temp1[9]); + temp2[9] = vec_sub(temp1[8], temp1[9]); + temp2[10] = vec_sub(temp1[11], temp1[10]); + temp2[11] = vec_add(temp1[11], temp1[10]); + temp2[12] = vec_add(temp1[12], temp1[13]); + temp2[13] = vec_sub(temp1[12], temp1[13]); + temp2[14] = vec_sub(temp1[15], temp1[14]); + temp2[15] = vec_add(temp1[15], temp1[14]); + + double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30], + &temp2[17]); + double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29], + &temp2[18]); + double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26], + &temp2[21]); + double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25], + &temp2[22]); + + // Stage 7 + double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]); + double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18], + &out[14]); + double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10], + &out[22]); + double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26], + &out[6]); + + temp0[16] = vec_add(temp2[16], temp2[17]); + temp0[17] = vec_sub(temp2[16], temp2[17]); + temp0[18] = vec_sub(temp2[19], temp2[18]); + temp0[19] = vec_add(temp2[19], temp2[18]); + temp0[20] = vec_add(temp2[20], temp2[21]); + temp0[21] = vec_sub(temp2[20], temp2[21]); + temp0[22] = vec_sub(temp2[23], temp2[22]); + temp0[23] = vec_add(temp2[23], temp2[22]); + temp0[24] = vec_add(temp2[24], temp2[25]); + temp0[25] = vec_sub(temp2[24], temp2[25]); + temp0[26] = vec_sub(temp2[27], temp2[26]); + temp0[27] = vec_add(temp2[27], temp2[26]); + temp0[28] = vec_add(temp2[28], temp2[29]); + temp0[29] = vec_sub(temp2[28], temp2[29]); + temp0[30] = vec_sub(temp2[31], temp2[30]); + temp0[31] = vec_add(temp2[31], temp2[30]); + + // Final stage --- outputs indices are bit-reversed. + double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1], + &out[31]); + double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17], + &out[15]); + double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9], + &out[23]); + double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25], + &out[7]); + double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5], + &out[27]); + double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21], + &out[11]); + double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13], + &out[19]); + double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29], + &out[3]); + + if (pass == 0) { + for (i = 0; i < 32; i++) { + out[i] = sub_round_shift(out[i]); + } + } +} + +void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + int16x8_t temp6[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + fdct32_vsx(temp0, temp1, 0); + + load(input + 8, stride, temp0); + fdct32_vsx(temp0, temp2, 0); + + load(input + 16, stride, temp0); + fdct32_vsx(temp0, temp3, 0); + + load(input + 24, stride, temp0); + fdct32_vsx(temp0, temp4, 0); + + // Generate the top row by munging the first set of 8 from each one + // together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out, temp6); + + // Second row of 8x32. + transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 8 * 32, temp6); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 16 * 32, temp6); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 24 * 32, temp6); +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c new file mode 100644 index 0000000000..e279b30478 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { + const int16x8_t b0 = vec_add(v[0], v[1]); + const int16x8_t b1 = vec_sub(v[0], v[1]); + const int16x8_t b2 = vec_add(v[2], v[3]); + const int16x8_t b3 = vec_sub(v[2], v[3]); + const int16x8_t b4 = vec_add(v[4], v[5]); + const int16x8_t b5 = vec_sub(v[4], v[5]); + const int16x8_t b6 = vec_add(v[6], v[7]); + const int16x8_t b7 = vec_sub(v[6], v[7]); + + const int16x8_t c0 = vec_add(b0, b2); + const int16x8_t c1 = vec_add(b1, b3); + const int16x8_t c2 = vec_sub(b0, b2); + const int16x8_t c3 = vec_sub(b1, b3); + const int16x8_t c4 = vec_add(b4, b6); + const int16x8_t c5 = vec_add(b5, b7); + const int16x8_t c6 = vec_sub(b4, b6); + const int16x8_t c7 = vec_sub(b5, b7); + + v[0] = vec_add(c0, c4); + v[1] = vec_sub(c2, c6); + v[2] = vec_sub(c0, c4); + v[3] = vec_add(c2, c6); + v[4] = vec_add(c3, c7); + v[5] = vec_sub(c3, c7); + v[6] = vec_sub(c1, c5); + v[7] = vec_add(c1, c5); +} + +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t v[8]; + + v[0] = vec_vsx_ld(0, src_diff); + v[1] = vec_vsx_ld(0, src_diff + src_stride); + v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride)); + v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride)); + v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride)); + v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride)); + v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride)); + v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride)); + + vpx_hadamard_s16_8x8_one_pass(v); + + vpx_transpose_s16_8x8(v); + + vpx_hadamard_s16_8x8_one_pass(v); + + store_tran_low(v[0], 0, coeff); + store_tran_low(v[1], 0, coeff + 8); + store_tran_low(v[2], 0, coeff + 16); + store_tran_low(v[3], 0, coeff + 24); + store_tran_low(v[4], 0, coeff + 32); + store_tran_low(v[5], 0, coeff + 40); + store_tran_low(v[6], 0, coeff + 48); + store_tran_low(v[7], 0, coeff + 56); +} + +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int i; + const uint16x8_t ones = vec_splat_u16(1); + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff); + /* Top right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + /* Overlay the 8x8 blocks and combine. */ + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low(0, coeff); + const int16x8_t a1 = load_tran_low(0, coeff + 64); + const int16x8_t a2 = load_tran_low(0, coeff + 128); + const int16x8_t a3 = load_tran_low(0, coeff + 192); + + /* Prevent the result from escaping int16_t. */ + const int16x8_t b0 = vec_sra(a0, ones); + const int16x8_t b1 = vec_sra(a1, ones); + const int16x8_t b2 = vec_sra(a2, ones); + const int16x8_t b3 = vec_sra(a3, ones); + + const int16x8_t c0 = vec_add(b0, b1); + const int16x8_t c2 = vec_add(b2, b3); + const int16x8_t c1 = vec_sub(b0, b1); + const int16x8_t c3 = vec_sub(b2, b3); + + const int16x8_t d0 = vec_add(c0, c2); + const int16x8_t d1 = vec_add(c1, c3); + const int16x8_t d2 = vec_sub(c0, c2); + const int16x8_t d3 = vec_sub(c1, c3); + + store_tran_low(d0, 0, coeff); + store_tran_low(d1, 0, coeff + 64); + store_tran_low(d2, 0, coeff + 128); + store_tran_low(d3, 0, coeff + 192); + + coeff += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c new file mode 100644 index 0000000000..a4c8322ff2 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, above); + int i; + (void)left; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(d, 0, dst); + } +} + +void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, above); + const uint8x16_t d1 = vec_vsx_ld(16, above); + int i; + (void)left; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(d0, 0, dst); + vec_vsx_st(d1, 16, dst); + } +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + +void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + (void)above; + + vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); + dst += stride; + vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst); +} + +void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + (void)above; + + vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst); + dst += stride; + vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); +} +#endif + +void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vec_vsx_ld(0, left); + const uint8x16_t v0 = vec_splat(d, 0); + const uint8x16_t v1 = vec_splat(d, 1); + const uint8x16_t v2 = vec_splat(d, 2); + const uint8x16_t v3 = vec_splat(d, 3); + + const uint8x16_t v4 = vec_splat(d, 4); + const uint8x16_t v5 = vec_splat(d, 5); + const uint8x16_t v6 = vec_splat(d, 6); + const uint8x16_t v7 = vec_splat(d, 7); + + const uint8x16_t v8 = vec_splat(d, 8); + const uint8x16_t v9 = vec_splat(d, 9); + const uint8x16_t v10 = vec_splat(d, 10); + const uint8x16_t v11 = vec_splat(d, 11); + + const uint8x16_t v12 = vec_splat(d, 12); + const uint8x16_t v13 = vec_splat(d, 13); + const uint8x16_t v14 = vec_splat(d, 14); + const uint8x16_t v15 = vec_splat(d, 15); + + (void)above; + + vec_vsx_st(v0, 0, dst); + dst += stride; + vec_vsx_st(v1, 0, dst); + dst += stride; + vec_vsx_st(v2, 0, dst); + dst += stride; + vec_vsx_st(v3, 0, dst); + dst += stride; + vec_vsx_st(v4, 0, dst); + dst += stride; + vec_vsx_st(v5, 0, dst); + dst += stride; + vec_vsx_st(v6, 0, dst); + dst += stride; + vec_vsx_st(v7, 0, dst); + dst += stride; + vec_vsx_st(v8, 0, dst); + dst += stride; + vec_vsx_st(v9, 0, dst); + dst += stride; + vec_vsx_st(v10, 0, dst); + dst += stride; + vec_vsx_st(v11, 0, dst); + dst += stride; + vec_vsx_st(v12, 0, dst); + dst += stride; + vec_vsx_st(v13, 0, dst); + dst += stride; + vec_vsx_st(v14, 0, dst); + dst += stride; + vec_vsx_st(v15, 0, dst); +} + +#define H_PREDICTOR_32(v) \ + vec_vsx_st(v, 0, dst); \ + vec_vsx_st(v, 16, dst); \ + dst += stride + +void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vec_vsx_ld(0, left); + const uint8x16_t d1 = vec_vsx_ld(16, left); + + const uint8x16_t v0_0 = vec_splat(d0, 0); + const uint8x16_t v1_0 = vec_splat(d0, 1); + const uint8x16_t v2_0 = vec_splat(d0, 2); + const uint8x16_t v3_0 = vec_splat(d0, 3); + const uint8x16_t v4_0 = vec_splat(d0, 4); + const uint8x16_t v5_0 = vec_splat(d0, 5); + const uint8x16_t v6_0 = vec_splat(d0, 6); + const uint8x16_t v7_0 = vec_splat(d0, 7); + const uint8x16_t v8_0 = vec_splat(d0, 8); + const uint8x16_t v9_0 = vec_splat(d0, 9); + const uint8x16_t v10_0 = vec_splat(d0, 10); + const uint8x16_t v11_0 = vec_splat(d0, 11); + const uint8x16_t v12_0 = vec_splat(d0, 12); + const uint8x16_t v13_0 = vec_splat(d0, 13); + const uint8x16_t v14_0 = vec_splat(d0, 14); + const uint8x16_t v15_0 = vec_splat(d0, 15); + + const uint8x16_t v0_1 = vec_splat(d1, 0); + const uint8x16_t v1_1 = vec_splat(d1, 1); + const uint8x16_t v2_1 = vec_splat(d1, 2); + const uint8x16_t v3_1 = vec_splat(d1, 3); + const uint8x16_t v4_1 = vec_splat(d1, 4); + const uint8x16_t v5_1 = vec_splat(d1, 5); + const uint8x16_t v6_1 = vec_splat(d1, 6); + const uint8x16_t v7_1 = vec_splat(d1, 7); + const uint8x16_t v8_1 = vec_splat(d1, 8); + const uint8x16_t v9_1 = vec_splat(d1, 9); + const uint8x16_t v10_1 = vec_splat(d1, 10); + const uint8x16_t v11_1 = vec_splat(d1, 11); + const uint8x16_t v12_1 = vec_splat(d1, 12); + const uint8x16_t v13_1 = vec_splat(d1, 13); + const uint8x16_t v14_1 = vec_splat(d1, 14); + const uint8x16_t v15_1 = vec_splat(d1, 15); + + (void)above; + + H_PREDICTOR_32(v0_0); + H_PREDICTOR_32(v1_0); + H_PREDICTOR_32(v2_0); + H_PREDICTOR_32(v3_0); + + H_PREDICTOR_32(v4_0); + H_PREDICTOR_32(v5_0); + H_PREDICTOR_32(v6_0); + H_PREDICTOR_32(v7_0); + + H_PREDICTOR_32(v8_0); + H_PREDICTOR_32(v9_0); + H_PREDICTOR_32(v10_0); + H_PREDICTOR_32(v11_0); + + H_PREDICTOR_32(v12_0); + H_PREDICTOR_32(v13_0); + H_PREDICTOR_32(v14_0); + H_PREDICTOR_32(v15_0); + + H_PREDICTOR_32(v0_1); + H_PREDICTOR_32(v1_1); + H_PREDICTOR_32(v2_1); + H_PREDICTOR_32(v3_1); + + H_PREDICTOR_32(v4_1); + H_PREDICTOR_32(v5_1); + H_PREDICTOR_32(v6_1); + H_PREDICTOR_32(v7_1); + + H_PREDICTOR_32(v8_1); + H_PREDICTOR_32(v9_1); + H_PREDICTOR_32(v10_1); + H_PREDICTOR_32(v11_1); + + H_PREDICTOR_32(v12_1); + H_PREDICTOR_32(v13_1); + H_PREDICTOR_32(v14_1); + H_PREDICTOR_32(v15_1); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + uint8x16_t d; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); + dst += stride; + + d = vec_vsx_ld(0, dst); + tmp = unpack_to_s16_l(d); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst); +} + +void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left)); + const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above)); + int16x8_t tmp, val; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 0), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 1), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 2), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 3), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 4), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 5), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 6), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); + dst += stride; + + tmp = unpack_to_s16_l(vec_vsx_ld(0, dst)); + val = vec_sub(vec_add(vec_splat(l, 7), a), tl); + vec_vsx_st(vec_packsu(val, tmp), 0, dst); +} +#endif + +static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, + int16x8_t ah, int16x8_t al, int16x8_t tl) { + int16x8_t vh, vl, ls; + + ls = vec_splat(l, 0); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 1); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 2); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 3); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 4); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 5); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 6); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + dst += stride; + + ls = vec_splat(l, 7); + vh = vec_sub(vec_add(ls, ah), tl); + vl = vec_sub(vec_add(ls, al), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); +} + +void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l = vec_vsx_ld(0, left); + const int16x8_t lh = unpack_to_s16_h(l); + const int16x8_t ll = unpack_to_s16_l(l); + const uint8x16_t a = vec_vsx_ld(0, above); + const int16x8_t ah = unpack_to_s16_h(a); + const int16x8_t al = unpack_to_s16_l(a); + + tm_predictor_16x8(dst, stride, lh, ah, al, tl); + + dst += stride * 8; + + tm_predictor_16x8(dst, stride, ll, ah, al, tl); +} + +static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls, + const int16x8_t a0h, const int16x8_t a0l, + const int16x8_t a1h, const int16x8_t a1l, + const int16x8_t tl) { + int16x8_t vh, vl; + + vh = vec_sub(vec_add(ls, a0h), tl); + vl = vec_sub(vec_add(ls, a0l), tl); + vec_vsx_st(vec_packsu(vh, vl), 0, dst); + vh = vec_sub(vec_add(ls, a1h), tl); + vl = vec_sub(vec_add(ls, a1l), tl); + vec_vsx_st(vec_packsu(vh, vl), 16, dst); +} + +static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride, + const int16x8_t l, const uint8x16_t a0, + const uint8x16_t a1, const int16x8_t tl) { + const int16x8_t a0h = unpack_to_s16_h(a0); + const int16x8_t a0l = unpack_to_s16_l(a0); + const int16x8_t a1h = unpack_to_s16_h(a1); + const int16x8_t a1l = unpack_to_s16_l(a1); + + tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl); + dst += stride; + + tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl); +} + +void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl); + dst += stride * 8; + + tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl); +} + +static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 8; i++, dst += stride) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(val, d, 1), 0, dst); + } +} + +static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 16; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + } +} + +void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_16x16(dst, stride, v128); +} + +static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride, + const uint8x16_t val) { + int i; + + for (i = 0; i < 32; i++, dst += stride) { + vec_vsx_st(val, 0, dst); + vec_vsx_st(val, 16, dst); + } +} + +void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7)); + (void)above; + (void)left; + + dc_fill_predictor_32x32(dst, stride, v128); +} + +static uint8x16_t avg16(const uint8_t *values) { + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_16x16(dst, stride, avg16(left)); +} + +void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_16x16(dst, stride, avg16(above)); +} + +static uint8x16_t avg32(const uint8_t *values) { + const uint8x16_t v0 = vec_vsx_ld(0, values); + const uint8x16_t v1 = vec_vsx_ld(16, values); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + dc_fill_predictor_32x32(dst, stride, avg32(left)); +} + +void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + dc_fill_predictor_32x32(dst, stride, avg32(above)); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8)); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} +#endif + +static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4)); + const int32x4_t sum4s = + (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0))); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); +} +#endif + +void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left)); +} + +static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t l0 = vec_vsx_ld(0, left); + const uint8x16_t l1 = vec_vsx_ld(16, left); + const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5)); + const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0))); + const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum)); + const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32); + const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6)); + + return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), + 3); +} + +void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left)); +} + +static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, + const uint8x16_t c) { + const uint8x16_t ac = + vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1))); + + return vec_avg(ac, b); +} + +// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken. +static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, + 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 7); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + const uint8x16_t d = vec_vsx_ld(0, dst); + vec_vsx_st(xxpermdi(row, d, 1), 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} +#endif + +void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(a, 15); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row, 0, dst); + dst += stride; + row = vec_perm(row, above_right, sl1); + } +} + +void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 15); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0 = avg3(a0, b0, c0); + uint8x16_t row1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 32; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 16, dst); + dst += stride; + row0 = vec_perm(row0, row1, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 +void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t af = vec_vsx_ld(0, above); + const uint8x16_t above_right = vec_splat(af, 9); + const uint8x16_t a = xxpermdi(af, above_right, 1); + const uint8x16_t b = vec_perm(a, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a, b); + uint8x16_t row1 = avg3(a, b, c); + int i; + (void)left; + + for (i = 0; i < 4; i++) { + const uint8x16_t d0 = vec_vsx_ld(0, dst); + const uint8x16_t d1 = vec_vsx_ld(0, dst + stride); + vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst); + vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} +#endif + +void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t above_right = vec_splat(a1, 0); + const uint8x16_t b = vec_perm(a0, above_right, sl1); + const uint8x16_t c = vec_perm(b, above_right, sl1); + uint8x16_t row0 = vec_avg(a0, b); + uint8x16_t row1 = avg3(a0, b, c); + int i; + (void)left; + + for (i = 0; i < 8; i++) { + vec_vsx_st(row0, 0, dst); + vec_vsx_st(row1, 0, dst + stride); + dst += stride * 2; + row0 = vec_perm(row0, above_right, sl1); + row1 = vec_perm(row1, above_right, sl1); + } +} + +void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t a0 = vec_vsx_ld(0, above); + const uint8x16_t a1 = vec_vsx_ld(16, above); + const uint8x16_t a2 = vec_vsx_ld(32, above); + const uint8x16_t above_right = vec_splat(a2, 0); + const uint8x16_t b0 = vec_perm(a0, a1, sl1); + const uint8x16_t b1 = vec_perm(a1, above_right, sl1); + const uint8x16_t c0 = vec_perm(b0, b1, sl1); + const uint8x16_t c1 = vec_perm(b1, above_right, sl1); + uint8x16_t row0_0 = vec_avg(a0, b0); + uint8x16_t row0_1 = vec_avg(a1, b1); + uint8x16_t row1_0 = avg3(a0, b0, c0); + uint8x16_t row1_1 = avg3(a1, b1, c1); + int i; + (void)left; + + for (i = 0; i < 16; i++) { + vec_vsx_st(row0_0, 0, dst); + vec_vsx_st(row0_1, 16, dst); + vec_vsx_st(row1_0, 0, dst + stride); + vec_vsx_st(row1_1, 16, dst + stride); + dst += stride * 2; + row0_0 = vec_perm(row0_0, row0_1, sl1); + row0_1 = vec_perm(row0_1, above_right, sl1); + row1_0 = vec_perm(row1_0, row1_1, sl1); + row1_1 = vec_perm(row1_1, above_right, sl1); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 0000000000..8c4e603dda --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1828 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364, + -16364, -16364, -16364, -16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305, + -16305, -16305, -16305, -16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893, + -15893, -15893, -15893, -15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811, + -14811, -14811, -14811, -14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449, + -14449, -14449, -14449, -14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623, + -13623, -13623, -13623, -13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160, + -13160, -13160, -13160, -13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585, + -11585, -11585, -11585, -11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003, + -11003, -11003, -11003, -11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394, + -10394, -10394, -10394, -10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423, + -8423, -8423, -8423, -8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520, + -5520, -5520, -5520, -5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756, + -4756, -4756, -4756, -4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196, + -3196, -3196, -3196, -3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404, + -2404, -2404, -2404, -2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283, + 5283, 5283, 5283, 5283 }; +static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929, + 9929, 9929, 9929, 9929 }; +static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377, + 13377, 13377, 13377, 13377 }; +static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212, + 15212, 15212, 15212, 15212 }; + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; + +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride) { + int i, j; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t tmp16_0, tmp16_1; + uint8x16_t output_v; + uint8_t tmp_dest[16]; + PIXEL_ADD_INIT; + + PIXEL_ADD4(out[0], in[0]); + PIXEL_ADD4(out[1], in[1]); + + PACK_STORE(out[0], out[1]); +} + +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + int16x8_t t0 = vec_mergeh(in[0], in[1]); + int16x8_t t1 = vec_mergel(in[0], in[1]); + ROUND_SHIFT_INIT + + in[0] = vec_mergeh(t0, t1); + in[1] = vec_mergel(t0, t1); + + IDCT4(in[0], in[1], out[0], out[1]); +} + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + // Rows + vpx_idct4_vsx(in, out); + + // Columns + vpx_idct4_vsx(out, in); + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ + out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3; + int32x4_t temp10, temp11; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); +} + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) { + uint8x16_t zerov = vec_splat_u8(0); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + + PIXEL_ADD(in[0], d_u0, add, shift5); + PIXEL_ADD(in[1], d_u1, add, shift5); + PIXEL_ADD(in[2], d_u2, add, shift5); + PIXEL_ADD(in[3], d_u3, add, shift5); + PIXEL_ADD(in[4], d_u4, add, shift5); + PIXEL_ADD(in[5], d_u5, add, shift5); + PIXEL_ADD(in[6], d_u6, add, shift5); + PIXEL_ADD(in[7], d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src[8], tmp[8]; + + src[0] = load_tran_low(0, input); + src[1] = load_tran_low(8 * sizeof(*input), input); + src[2] = load_tran_low(16 * sizeof(*input), input); + src[3] = load_tran_low(24 * sizeof(*input), input); + src[4] = load_tran_low(32 * sizeof(*input), input); + src[5] = load_tran_low(40 * sizeof(*input), input); + src[6] = load_tran_low(48 * sizeof(*input), input); + src[7] = load_tran_low(56 * sizeof(*input), input); + + vpx_idct8_vsx(src, tmp); + vpx_idct8_vsx(tmp, src); + + vpx_round_store8x8_vsx(src, dest, stride); +} + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ + STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 = vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +static void half_idct16x8_vsx(int16x8_t *src) { + int16x8_t tmp0[8], tmp1[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12], + src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13], + src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14], + src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]); +} + +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + + IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); +} + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride) { + uint8x16_t destv[16]; + int16x8_t d_uh, d_ul; + uint8x16_t zerov = vec_splat_u8(0); + uint16x8_t shift6 = vec_splat_u16(6); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv); + + PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0); + PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride); + PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride); + PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride); + PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride); + PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride); + PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride); + PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride); + + PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride); + PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride); + PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride); + PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride); + PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride); + PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride); + PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride); + PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride); +} +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[16], src1[16]; + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src1); + + // transform rows + // transform the upper half of 16x16 matrix + half_idct16x8_vsx(src0); + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + + // transform the lower half of 16x16 matrix + half_idct16x8_vsx(src1); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + // transform columns + // left half first + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + // right half + IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); + + vpx_round_store16x16_vsx(src0, src1, dest, stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step) * stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride, dest); \ + dst = vec_vsx_ld((step) * stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} + +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v; + int32x4_t v_v[5], u_v[4]; + int32x4_t zerov = vec_splat_s32(0); + int16x8_t tmp0, tmp1; + int16x8_t zero16v = vec_splat_s16(0); + uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1)); + ROUND_SHIFT_INIT; + + sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v); + sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v); + sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v); + sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v); + sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v), + vec_sub(zero16v, sinpi_3_9_v)); + + tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]); + tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]); + in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1); + in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1); + + v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov); + v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov); + v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov); + v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov); + v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov); + + in[0] = vec_sub(in[0], in[1]); + in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16); + in[0] = vec_add(in[0], in[1]); + in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16); + + u_v[0] = vec_add(v_v[0], v_v[1]); + u_v[1] = vec_sub(v_v[2], v_v[3]); + u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov); + u_v[3] = vec_sub(v_v[1], v_v[3]); + u_v[3] = vec_add(u_v[3], v_v[4]); + + DCT_CONST_ROUND_SHIFT(u_v[0]); + DCT_CONST_ROUND_SHIFT(u_v[1]); + DCT_CONST_ROUND_SHIFT(u_v[2]); + DCT_CONST_ROUND_SHIFT(u_v[3]); + + out[0] = vec_packs(u_v[0], u_v[1]); + out[1] = vec_packs(u_v[2], u_v[3]); +} + +#define MSUM_ROUND_SHIFT(a, b, cospi) \ + b = vec_msums(a, cospi, zerov); \ + DCT_CONST_ROUND_SHIFT(b); + +#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \ + MSUM_ROUND_SHIFT(in0, tmp0, cospi); \ + MSUM_ROUND_SHIFT(in1, tmp1, cospi); \ + out = vec_packs(tmp0, tmp1); + +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[16], tmp1[16]; + + int32x4_t zerov = vec_splat_s32(0); + int16x8_t zero16v = vec_splat_s16(0); + int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v); + int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v); + int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v); + int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v); + int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v); + int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v); + int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v); + int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v); + int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v); + int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + // stage 1 + // interleave and multiply/add into 32-bit integer + in[0] = vec_mergeh(out[7], out[0]); + in[1] = vec_mergel(out[7], out[0]); + in[2] = vec_mergeh(out[5], out[2]); + in[3] = vec_mergel(out[5], out[2]); + in[4] = vec_mergeh(out[3], out[4]); + in[5] = vec_mergel(out[3], out[4]); + in[6] = vec_mergeh(out[1], out[6]); + in[7] = vec_mergel(out[1], out[6]); + + tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov); + tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov); + tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov); + tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov); + tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov); + tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov); + tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov); + tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov); + tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov); + tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov); + tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov); + tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov); + tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov); + tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov); + tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov); + tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[8]); + tmp0[1] = vec_add(tmp1[1], tmp1[9]); + tmp0[2] = vec_add(tmp1[2], tmp1[10]); + tmp0[3] = vec_add(tmp1[3], tmp1[11]); + tmp0[4] = vec_add(tmp1[4], tmp1[12]); + tmp0[5] = vec_add(tmp1[5], tmp1[13]); + tmp0[6] = vec_add(tmp1[6], tmp1[14]); + tmp0[7] = vec_add(tmp1[7], tmp1[15]); + tmp0[8] = vec_sub(tmp1[0], tmp1[8]); + tmp0[9] = vec_sub(tmp1[1], tmp1[9]); + tmp0[10] = vec_sub(tmp1[2], tmp1[10]); + tmp0[11] = vec_sub(tmp1[3], tmp1[11]); + tmp0[12] = vec_sub(tmp1[4], tmp1[12]); + tmp0[13] = vec_sub(tmp1[5], tmp1[13]); + tmp0[14] = vec_sub(tmp1[6], tmp1[14]); + tmp0[15] = vec_sub(tmp1[7], tmp1[15]); + + // shift and rounding + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + DCT_CONST_ROUND_SHIFT(tmp0[8]); + DCT_CONST_ROUND_SHIFT(tmp0[9]); + DCT_CONST_ROUND_SHIFT(tmp0[10]); + DCT_CONST_ROUND_SHIFT(tmp0[11]); + DCT_CONST_ROUND_SHIFT(tmp0[12]); + DCT_CONST_ROUND_SHIFT(tmp0[13]); + DCT_CONST_ROUND_SHIFT(tmp0[14]); + DCT_CONST_ROUND_SHIFT(tmp0[15]); + + // back to 16-bit + out[0] = vec_packs(tmp0[0], tmp0[1]); + out[1] = vec_packs(tmp0[2], tmp0[3]); + out[2] = vec_packs(tmp0[4], tmp0[5]); + out[3] = vec_packs(tmp0[6], tmp0[7]); + out[4] = vec_packs(tmp0[8], tmp0[9]); + out[5] = vec_packs(tmp0[10], tmp0[11]); + out[6] = vec_packs(tmp0[12], tmp0[13]); + out[7] = vec_packs(tmp0[14], tmp0[15]); + + // stage 2 + in[0] = vec_add(out[0], out[2]); + in[1] = vec_add(out[1], out[3]); + in[2] = vec_sub(out[0], out[2]); + in[3] = vec_sub(out[1], out[3]); + in[4] = vec_mergeh(out[4], out[5]); + in[5] = vec_mergel(out[4], out[5]); + in[6] = vec_mergeh(out[6], out[7]); + in[7] = vec_mergel(out[6], out[7]); + + tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov); + tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov); + tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov); + tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov); + tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov); + tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov); + tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov); + tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[4]); + tmp0[1] = vec_add(tmp1[1], tmp1[5]); + tmp0[2] = vec_add(tmp1[2], tmp1[6]); + tmp0[3] = vec_add(tmp1[3], tmp1[7]); + tmp0[4] = vec_sub(tmp1[0], tmp1[4]); + tmp0[5] = vec_sub(tmp1[1], tmp1[5]); + tmp0[6] = vec_sub(tmp1[2], tmp1[6]); + tmp0[7] = vec_sub(tmp1[3], tmp1[7]); + + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + + in[4] = vec_packs(tmp0[0], tmp0[1]); + in[5] = vec_packs(tmp0[2], tmp0[3]); + in[6] = vec_packs(tmp0[4], tmp0[5]); + in[7] = vec_packs(tmp0[6], tmp0[7]); + + // stage 3 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v); + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v); + + out[0] = in[0]; + out[2] = in[6]; + out[4] = in[3]; + out[6] = in[5]; + + out[1] = vec_sub(zero16v, in[4]); + out[3] = vec_sub(zero16v, in[2]); + out[5] = vec_sub(zero16v, in[7]); + out[7] = vec_sub(zero16v, in[1]); +} + +static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[32], tmp1[32]; + int16x8_t tmp16_0[8]; + int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v); + int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v); + int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v); + int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v); + int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v); + int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v); + int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v); + int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v); + int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v); + int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v); + int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v); + int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v); + int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v); + int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v); + int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v); + int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v); + int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v); + int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v); + int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v); + int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v); + int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v); + int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v); + int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v); + int32x4_t zerov = vec_splat_s32(0); + ROUND_SHIFT_INIT; + + tmp16_0[0] = vec_mergeh(in[15], in[0]); + tmp16_0[1] = vec_mergel(in[15], in[0]); + tmp16_0[2] = vec_mergeh(in[13], in[2]); + tmp16_0[3] = vec_mergel(in[13], in[2]); + tmp16_0[4] = vec_mergeh(in[11], in[4]); + tmp16_0[5] = vec_mergel(in[11], in[4]); + tmp16_0[6] = vec_mergeh(in[9], in[6]); + tmp16_0[7] = vec_mergel(in[9], in[6]); + tmp16_0[8] = vec_mergeh(in[7], in[8]); + tmp16_0[9] = vec_mergel(in[7], in[8]); + tmp16_0[10] = vec_mergeh(in[5], in[10]); + tmp16_0[11] = vec_mergel(in[5], in[10]); + tmp16_0[12] = vec_mergeh(in[3], in[12]); + tmp16_0[13] = vec_mergel(in[3], in[12]); + tmp16_0[14] = vec_mergeh(in[1], in[14]); + tmp16_0[15] = vec_mergel(in[1], in[14]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov); + tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov); + tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov); + tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov); + tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov); + tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov); + tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov); + tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov); + tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov); + tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov); + tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov); + tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov); + tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov); + tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov); + tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov); + tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov); + tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[16]); + tmp1[1] = vec_add(tmp0[1], tmp0[17]); + tmp1[2] = vec_add(tmp0[2], tmp0[18]); + tmp1[3] = vec_add(tmp0[3], tmp0[19]); + tmp1[4] = vec_add(tmp0[4], tmp0[20]); + tmp1[5] = vec_add(tmp0[5], tmp0[21]); + tmp1[6] = vec_add(tmp0[6], tmp0[22]); + tmp1[7] = vec_add(tmp0[7], tmp0[23]); + tmp1[8] = vec_add(tmp0[8], tmp0[24]); + tmp1[9] = vec_add(tmp0[9], tmp0[25]); + tmp1[10] = vec_add(tmp0[10], tmp0[26]); + tmp1[11] = vec_add(tmp0[11], tmp0[27]); + tmp1[12] = vec_add(tmp0[12], tmp0[28]); + tmp1[13] = vec_add(tmp0[13], tmp0[29]); + tmp1[14] = vec_add(tmp0[14], tmp0[30]); + tmp1[15] = vec_add(tmp0[15], tmp0[31]); + tmp1[16] = vec_sub(tmp0[0], tmp0[16]); + tmp1[17] = vec_sub(tmp0[1], tmp0[17]); + tmp1[18] = vec_sub(tmp0[2], tmp0[18]); + tmp1[19] = vec_sub(tmp0[3], tmp0[19]); + tmp1[20] = vec_sub(tmp0[4], tmp0[20]); + tmp1[21] = vec_sub(tmp0[5], tmp0[21]); + tmp1[22] = vec_sub(tmp0[6], tmp0[22]); + tmp1[23] = vec_sub(tmp0[7], tmp0[23]); + tmp1[24] = vec_sub(tmp0[8], tmp0[24]); + tmp1[25] = vec_sub(tmp0[9], tmp0[25]); + tmp1[26] = vec_sub(tmp0[10], tmp0[26]); + tmp1[27] = vec_sub(tmp0[11], tmp0[27]); + tmp1[28] = vec_sub(tmp0[12], tmp0[28]); + tmp1[29] = vec_sub(tmp0[13], tmp0[29]); + tmp1[30] = vec_sub(tmp0[14], tmp0[30]); + tmp1[31] = vec_sub(tmp0[15], tmp0[31]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + DCT_CONST_ROUND_SHIFT(tmp1[16]); + DCT_CONST_ROUND_SHIFT(tmp1[17]); + DCT_CONST_ROUND_SHIFT(tmp1[18]); + DCT_CONST_ROUND_SHIFT(tmp1[19]); + DCT_CONST_ROUND_SHIFT(tmp1[20]); + DCT_CONST_ROUND_SHIFT(tmp1[21]); + DCT_CONST_ROUND_SHIFT(tmp1[22]); + DCT_CONST_ROUND_SHIFT(tmp1[23]); + DCT_CONST_ROUND_SHIFT(tmp1[24]); + DCT_CONST_ROUND_SHIFT(tmp1[25]); + DCT_CONST_ROUND_SHIFT(tmp1[26]); + DCT_CONST_ROUND_SHIFT(tmp1[27]); + DCT_CONST_ROUND_SHIFT(tmp1[28]); + DCT_CONST_ROUND_SHIFT(tmp1[29]); + DCT_CONST_ROUND_SHIFT(tmp1[30]); + DCT_CONST_ROUND_SHIFT(tmp1[31]); + + in[0] = vec_packs(tmp1[0], tmp1[1]); + in[1] = vec_packs(tmp1[2], tmp1[3]); + in[2] = vec_packs(tmp1[4], tmp1[5]); + in[3] = vec_packs(tmp1[6], tmp1[7]); + in[4] = vec_packs(tmp1[8], tmp1[9]); + in[5] = vec_packs(tmp1[10], tmp1[11]); + in[6] = vec_packs(tmp1[12], tmp1[13]); + in[7] = vec_packs(tmp1[14], tmp1[15]); + in[8] = vec_packs(tmp1[16], tmp1[17]); + in[9] = vec_packs(tmp1[18], tmp1[19]); + in[10] = vec_packs(tmp1[20], tmp1[21]); + in[11] = vec_packs(tmp1[22], tmp1[23]); + in[12] = vec_packs(tmp1[24], tmp1[25]); + in[13] = vec_packs(tmp1[26], tmp1[27]); + in[14] = vec_packs(tmp1[28], tmp1[29]); + in[15] = vec_packs(tmp1[30], tmp1[31]); + + // stage 2 + tmp16_0[0] = vec_mergeh(in[8], in[9]); + tmp16_0[1] = vec_mergel(in[8], in[9]); + tmp16_0[2] = vec_mergeh(in[10], in[11]); + tmp16_0[3] = vec_mergel(in[10], in[11]); + tmp16_0[4] = vec_mergeh(in[12], in[13]); + tmp16_0[5] = vec_mergel(in[12], in[13]); + tmp16_0[6] = vec_mergeh(in[14], in[15]); + tmp16_0[7] = vec_mergel(in[14], in[15]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[8]); + tmp1[1] = vec_add(tmp0[1], tmp0[9]); + tmp1[2] = vec_add(tmp0[2], tmp0[10]); + tmp1[3] = vec_add(tmp0[3], tmp0[11]); + tmp1[4] = vec_add(tmp0[4], tmp0[12]); + tmp1[5] = vec_add(tmp0[5], tmp0[13]); + tmp1[6] = vec_add(tmp0[6], tmp0[14]); + tmp1[7] = vec_add(tmp0[7], tmp0[15]); + tmp1[8] = vec_sub(tmp0[0], tmp0[8]); + tmp1[9] = vec_sub(tmp0[1], tmp0[9]); + tmp1[10] = vec_sub(tmp0[2], tmp0[10]); + tmp1[11] = vec_sub(tmp0[3], tmp0[11]); + tmp1[12] = vec_sub(tmp0[4], tmp0[12]); + tmp1[13] = vec_sub(tmp0[5], tmp0[13]); + tmp1[14] = vec_sub(tmp0[6], tmp0[14]); + tmp1[15] = vec_sub(tmp0[7], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + tmp16_0[0] = vec_add(in[0], in[4]); + tmp16_0[1] = vec_add(in[1], in[5]); + tmp16_0[2] = vec_add(in[2], in[6]); + tmp16_0[3] = vec_add(in[3], in[7]); + tmp16_0[4] = vec_sub(in[0], in[4]); + tmp16_0[5] = vec_sub(in[1], in[5]); + tmp16_0[6] = vec_sub(in[2], in[6]); + tmp16_0[7] = vec_sub(in[3], in[7]); + tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]); + tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]); + tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]); + tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]); + tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]); + tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]); + tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]); + tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 3 + in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]); + in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]); + in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]); + in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]); + in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]); + in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]); + in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]); + in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]); + + tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov); + tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov); + tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov); + tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov); + tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov); + tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov); + tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov); + tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov); + tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov); + tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov); + tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov); + tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov); + tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov); + tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov); + tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov); + tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[4]); + tmp1[1] = vec_add(tmp0[1], tmp0[5]); + tmp1[2] = vec_add(tmp0[2], tmp0[6]); + tmp1[3] = vec_add(tmp0[3], tmp0[7]); + tmp1[4] = vec_sub(tmp0[0], tmp0[4]); + tmp1[5] = vec_sub(tmp0[1], tmp0[5]); + tmp1[6] = vec_sub(tmp0[2], tmp0[6]); + tmp1[7] = vec_sub(tmp0[3], tmp0[7]); + tmp1[8] = vec_add(tmp0[8], tmp0[12]); + tmp1[9] = vec_add(tmp0[9], tmp0[13]); + tmp1[10] = vec_add(tmp0[10], tmp0[14]); + tmp1[11] = vec_add(tmp0[11], tmp0[15]); + tmp1[12] = vec_sub(tmp0[8], tmp0[12]); + tmp1[13] = vec_sub(tmp0[9], tmp0[13]); + tmp1[14] = vec_sub(tmp0[10], tmp0[14]); + tmp1[15] = vec_sub(tmp0[11], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + in[0] = vec_add(tmp16_0[0], tmp16_0[2]); + in[1] = vec_add(tmp16_0[1], tmp16_0[3]); + in[2] = vec_sub(tmp16_0[0], tmp16_0[2]); + in[3] = vec_sub(tmp16_0[1], tmp16_0[3]); + in[4] = vec_packs(tmp1[0], tmp1[1]); + in[5] = vec_packs(tmp1[2], tmp1[3]); + in[6] = vec_packs(tmp1[4], tmp1[5]); + in[7] = vec_packs(tmp1[6], tmp1[7]); + in[8] = vec_add(tmp16_0[8], tmp16_0[10]); + in[9] = vec_add(tmp16_0[9], tmp16_0[11]); + in[10] = vec_sub(tmp16_0[8], tmp16_0[10]); + in[11] = vec_sub(tmp16_0[9], tmp16_0[11]); + in[12] = vec_packs(tmp1[8], tmp1[9]); + in[13] = vec_packs(tmp1[10], tmp1[11]); + in[14] = vec_packs(tmp1[12], tmp1[13]); + in[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 4 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + out[4] = vec_mergeh(in[10], in[11]); + out[5] = vec_mergel(in[10], in[11]); + out[6] = vec_mergeh(in[14], in[15]); + out[7] = vec_mergel(in[14], in[15]); +} + +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[16], tmp1[16], tmp2[8]; + int32x4_t tmp3, tmp4; + int16x8_t zero16v = vec_splat_s16(0); + int32x4_t zerov = vec_splat_s32(0); + int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v); + int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12], + tmp0[13], tmp0[14], tmp0[15]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12], + tmp1[13], tmp1[14], tmp1[15]); + + iadst16x8_vsx(tmp0, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16); + + src0[0] = tmp0[0]; + src0[2] = vec_sub(zero16v, tmp0[8]); + src0[4] = tmp0[12]; + src0[6] = vec_sub(zero16v, tmp0[4]); + src1[8] = tmp0[5]; + src1[10] = vec_sub(zero16v, tmp0[13]); + src1[12] = tmp0[9]; + src1[14] = vec_sub(zero16v, tmp0[1]); + + iadst16x8_vsx(tmp1, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16); + + src0[1] = tmp1[0]; + src0[3] = vec_sub(zero16v, tmp1[8]); + src0[5] = tmp1[12]; + src0[7] = vec_sub(zero16v, tmp1[4]); + src1[9] = tmp1[5]; + src1[11] = vec_sub(zero16v, tmp1[13]); + src1[13] = tmp1[9]; + src1[15] = vec_sub(zero16v, tmp1[1]); +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h new file mode 100644 index 0000000000..7031742c1c --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ +#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out); + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride); +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out); + +#define LOAD_INPUT16(load, source, offset, step, in) \ + in[0] = load(offset, source); \ + in[1] = load((step) + (offset), source); \ + in[2] = load(2 * (step) + (offset), source); \ + in[3] = load(3 * (step) + (offset), source); \ + in[4] = load(4 * (step) + (offset), source); \ + in[5] = load(5 * (step) + (offset), source); \ + in[6] = load(6 * (step) + (offset), source); \ + in[7] = load(7 * (step) + (offset), source); \ + in[8] = load(8 * (step) + (offset), source); \ + in[9] = load(9 * (step) + (offset), source); \ + in[10] = load(10 * (step) + (offset), source); \ + in[11] = load(11 * (step) + (offset), source); \ + in[12] = load(12 * (step) + (offset), source); \ + in[13] = load(13 * (step) + (offset), source); \ + in[14] = load(14 * (step) + (offset), source); \ + in[15] = load(15 * (step) + (offset), source); + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride); +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1); +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1); + +#endif // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 0000000000..ab71f6e235 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Quantization function used for 4x4, 8x8 and 16x16 blocks. +static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + qcoeff = vec_mulhi(qcoeff, quant_shift); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// Quantization function used for 32x32 blocks. +static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, + bool16x8_t mask) { + const int16x8_t rounded = vec_vaddshs(coeff_abs, round); + int16x8_t qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + // 32x32 blocks require an extra multiplication by 2, this compensates for the + // extra right shift added in vec_mulhi, as such vec_madds can be used + // directly instead of vec_mulhi (((a * b) >> 15) >> 1) << 1 == (a * b >> 15) + qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. + dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, + const int16_t *iscan_ptr, int index) { + int16x8_t scan = vec_vsx_ld(index, iscan_ptr); + bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + return vec_andc(scan, zero_coeff); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + (void)scan_ptr; + + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)n_coeffs; + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); + + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c new file mode 100644 index 0000000000..a08ae12413 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/types_vsx.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_abs = vec_absd(v_a, v_b); \ + v_sad = vec_sum4s(v_abs, v_sad); + +#define SAD8(height) \ + unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0) \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[1] + v_sad[0]; \ + } + +#define SAD16(height) \ + unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +#define SAD32(height) \ + unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + PROCESS16(16); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +#define SAD64(height) \ + unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0); \ + PROCESS16(16); \ + PROCESS16(32); \ + PROCESS16(48); \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ + } + +SAD8(4); +SAD8(8); +SAD8(16); +SAD16(8); +SAD16(16); +SAD16(32); +SAD32(16); +SAD32(32); +SAD32(64); +SAD64(32); +SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); +SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ + v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 0000000000..76ad302da6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static VPX_FORCE_INLINE void subtract_block4x4( + int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { + int16_t *diff1 = diff + 2 * diff_stride; + const uint8_t *src1 = src + 2 * src_stride; + const uint8_t *pred1 = pred + 2 * pred_stride; + + const int16x8_t d0 = vec_vsx_ld(0, diff); + const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride); + const int16x8_t d2 = vec_vsx_ld(0, diff1); + const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride); + + const uint8x16_t s0 = read4x2(src, (int)src_stride); + const uint8x16_t p0 = read4x2(pred, (int)pred_stride); + const uint8x16_t s1 = read4x2(src1, (int)src_stride); + const uint8x16_t p1 = read4x2(pred1, (int)pred_stride); + + const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + + vec_vsx_st(xxpermdi(da, d0, 1), 0, diff); + vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride); + vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1); + vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride); +} + +void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + switch (cols) { + case 64: + case 32: + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vec_vsx_ld(0, src + c); + const uint8x16_t s1 = vec_vsx_ld(16, src + c); + const uint8x16_t p0 = vec_vsx_ld(0, pred + c); + const uint8x16_t p1 = vec_vsx_ld(16, pred + c); + const int16x8_t d0l = + vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = + vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t d1l = + vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1)); + const int16x8_t d1h = + vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + vec_vsx_st(d0h, 0, diff + c); + vec_vsx_st(d0l, 16, diff + c); + vec_vsx_st(d1h, 0, diff + c + 16); + vec_vsx_st(d1l, 16, diff + c + 16); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 16: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + vec_vsx_st(d0l, 16, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 8: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 4: + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + if (r > 4) { + diff += 4 * diff_stride; + pred += 4 * pred_stride; + src += 4 * src_stride; + + subtract_block4x4(diff, diff_stride, + + src, src_stride, + + pred, pred_stride); + } + break; + default: assert(0); // unreachable + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h new file mode 100644 index 0000000000..4883b734ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/transpose_vsx.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ + +#include "./vpx_config.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { + // d = vec_mergeh(a,b): + // The even elements of the result are obtained left-to-right, + // from the high elements of a. + // The odd elements of the result are obtained left-to-right, + // from the high elements of b. + // + // d = vec_mergel(a,b): + // The even elements of the result are obtained left-to-right, + // from the low elements of a. + // The odd elements of the result are obtained left-to-right, + // from the low elements of b. + + // Example, starting with: + // v[0]: 00 01 02 03 04 05 06 07 + // v[1]: 10 11 12 13 14 15 16 17 + // v[2]: 20 21 22 23 24 25 26 27 + // v[3]: 30 31 32 33 34 35 36 37 + // v[4]: 40 41 42 43 44 45 46 47 + // v[5]: 50 51 52 53 54 55 56 57 + // v[6]: 60 61 62 63 64 65 66 67 + // v[7]: 70 71 72 73 74 75 76 77 + + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + b0 = vec_mergeh(v[0], v[4]); + b1 = vec_mergel(v[0], v[4]); + b2 = vec_mergeh(v[1], v[5]); + b3 = vec_mergel(v[1], v[5]); + b4 = vec_mergeh(v[2], v[6]); + b5 = vec_mergel(v[2], v[6]); + b6 = vec_mergeh(v[3], v[7]); + b7 = vec_mergel(v[3], v[7]); + + // After first merge operation + // b0: 00 40 01 41 02 42 03 43 + // b1: 04 44 05 45 06 46 07 47 + // b2: 10 50 11 51 12 52 13 53 + // b3: 14 54 15 55 16 56 17 57 + // b4: 20 60 21 61 22 62 23 63 + // b5: 24 64 25 65 26 66 27 67 + // b6: 30 70 31 71 32 62 33 73 + // b7: 34 74 35 75 36 76 37 77 + + c0 = vec_mergeh(b0, b4); + c1 = vec_mergel(b0, b4); + c2 = vec_mergeh(b1, b5); + c3 = vec_mergel(b1, b5); + c4 = vec_mergeh(b2, b6); + c5 = vec_mergel(b2, b6); + c6 = vec_mergeh(b3, b7); + c7 = vec_mergel(b3, b7); + + // After second merge operation + // c0: 00 20 40 60 01 21 41 61 + // c1: 02 22 42 62 03 23 43 63 + // c2: 04 24 44 64 05 25 45 65 + // c3: 06 26 46 66 07 27 47 67 + // c4: 10 30 50 70 11 31 51 71 + // c5: 12 32 52 72 13 33 53 73 + // c6: 14 34 54 74 15 35 55 75 + // c7: 16 36 56 76 17 37 57 77 + + v[0] = vec_mergeh(c0, c4); + v[1] = vec_mergel(c0, c4); + v[2] = vec_mergeh(c1, c5); + v[3] = vec_mergel(c1, c5); + v[4] = vec_mergeh(c2, c6); + v[5] = vec_mergel(c2, c6); + v[6] = vec_mergeh(c3, c7); + v[7] = vec_mergel(c3, c7); + + // After last merge operation + // v[0]: 00 10 20 30 40 50 60 70 + // v[1]: 01 11 21 31 41 51 61 71 + // v[2]: 02 12 22 32 42 52 62 72 + // v[3]: 03 13 23 33 43 53 63 73 + // v[4]: 04 14 24 34 44 54 64 74 + // v[5]: 05 15 25 35 45 55 65 75 + // v[6]: 06 16 26 36 46 56 66 76 + // v[7]: 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Stage 1 + const int16x8_t s1_0 = vec_mergeh(a[0], a[4]); + const int16x8_t s1_1 = vec_mergel(a[0], a[4]); + const int16x8_t s1_2 = vec_mergeh(a[1], a[5]); + const int16x8_t s1_3 = vec_mergel(a[1], a[5]); + const int16x8_t s1_4 = vec_mergeh(a[2], a[6]); + const int16x8_t s1_5 = vec_mergel(a[2], a[6]); + const int16x8_t s1_6 = vec_mergeh(a[3], a[7]); + const int16x8_t s1_7 = vec_mergel(a[3], a[7]); + + // Stage 2 + const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4); + const int16x8_t s2_1 = vec_mergel(s1_0, s1_4); + const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5); + const int16x8_t s2_3 = vec_mergel(s1_1, s1_5); + const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6); + const int16x8_t s2_5 = vec_mergel(s1_2, s1_6); + const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7); + const int16x8_t s2_7 = vec_mergel(s1_3, s1_7); + + // Stage 2 + b[0] = vec_mergeh(s2_0, s2_4); + b[1] = vec_mergel(s2_0, s2_4); + b[2] = vec_mergeh(s2_1, s2_5); + b[3] = vec_mergel(s2_1, s2_5); + b[4] = vec_mergeh(s2_2, s2_6); + b[5] = vec_mergel(s2_2, s2_6); + b[6] = vec_mergeh(s2_3, s2_7); + b[7] = vec_mergel(s2_3, s2_7); +} + +#endif // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h new file mode 100644 index 0000000000..2907a1fe40 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/txfm_common_vsx.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ +#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 }; + +static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 }; + +static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#endif // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h new file mode 100644 index 0000000000..9806b81cd0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/types_vsx.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_ +#define VPX_VPX_DSP_PPC_TYPES_VSX_H_ + +#include + +typedef vector signed char int8x16_t; +typedef vector unsigned char uint8x16_t; +typedef vector signed short int16x8_t; +typedef vector unsigned short uint16x8_t; +typedef vector signed int int32x4_t; +typedef vector unsigned int uint32x4_t; +typedef vector bool char bool8x16_t; +typedef vector bool short bool16x8_t; +typedef vector bool int bool32x4_t; + +#if defined(__clang__) && __clang_major__ < 6 +static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +static const uint8x16_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17 }; +static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; +#define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) +#elif defined(__GNUC__) && \ + (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3)) +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif + +#ifdef WORDS_BIGENDIAN +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh(vec_splat_u8(0), (uint8x16_t)v) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel(vec_splat_u8(0), (uint8x16_t)v) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) +#endif +#else +#define unpack_to_u16_h(v) \ + (uint16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_u16_l(v) \ + (uint16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_h(v) \ + (int16x8_t) vec_mergeh((uint8x16_t)v, vec_splat_u8(0)) +#define unpack_to_s16_l(v) \ + (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) +#ifndef xxpermdi +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c) & 1) << 1) ^ 3) +#endif +#endif + +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +#ifndef __POWER9_VECTOR__ +#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b)) +#endif + +static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; +static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; +static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 }; +static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; +static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07 }; +static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03 }; +static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, + 0x0E, 0x0F, 0x00, 0x01 }; + +static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11, + 0x04, 0x05, 0x14, 0x15, + 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + +#endif // VPX_VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c new file mode 100644 index 0000000000..6c6bc9a301 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + int distortion; + + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t a1 = + unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t b1 = + unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride)); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); + const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3); + + vec_ste(d, 0, &distortion); + + return distortion; +} + +// TODO(lu_zero): Unroll +uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) { + unsigned int i, sum = 0; + int32x4_t s = vec_splat_s32(0); + + for (i = 0; i < 256; i += 8) { + const int16x8_t v = vec_vsx_ld(0, src_ptr + i); + s = vec_msum(v, v, s); + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)s, 0, &sum); + + return sum; +} + +void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width >= 16) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref)); + vec_vsx_st(v, j, comp_pred); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + // Process 2 lines at time + for (i = 0; i < height / 2; ++i) { + const uint8x16_t r0 = vec_vsx_ld(0, ref); + const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride); + const uint8x16_t r = xxpermdi(r0, r1, 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 2; + pred += 16; // width * 2; + ref += ref_stride * 2; + } + } else { + assert(width == 4); + // process 4 lines at time + for (i = 0; i < height / 4; ++i) { + const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref); + const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride); + const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2); + const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3); + const uint8x16_t r = + (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0); + const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r); + vec_vsx_st(v, 0, comp_pred); + comp_pred += 16; // width * 4; + pred += 16; // width * 4; + ref += ref_stride * 4; + } + } +} + +static INLINE void variance_inner_32(const uint8_t *src_ptr, + const uint8_t *ref_ptr, + int32x4_t *sum_squared, int32x4_t *sum) { + int32x4_t s = *sum; + int32x4_t ss = *sum_squared; + + const uint8x16_t va0 = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr); + const uint8x16_t va1 = vec_vsx_ld(16, src_ptr); + const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr); + + const int16x8_t a0 = unpack_to_s16_h(va0); + const int16x8_t b0 = unpack_to_s16_h(vb0); + const int16x8_t a1 = unpack_to_s16_l(va0); + const int16x8_t b1 = unpack_to_s16_l(vb0); + const int16x8_t a2 = unpack_to_s16_h(va1); + const int16x8_t b2 = unpack_to_s16_h(vb1); + const int16x8_t a3 = unpack_to_s16_l(va1); + const int16x8_t b3 = unpack_to_s16_l(vb1); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int16x8_t d2 = vec_sub(a2, b2); + const int16x8_t d3 = vec_sub(a3, b3); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + s = vec_sum4s(d2, s); + ss = vec_msum(d2, d2, ss); + s = vec_sum4s(d3, s); + ss = vec_msum(d3, d3, ss); + *sum = s; + *sum_squared = ss; +} + +static INLINE void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i; + + int32x4_t s = vec_splat_s32(0); + int32x4_t ss = vec_splat_s32(0); + + switch (w) { + case 4: + for (i = 0; i < h / 2; ++i) { + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t d = vec_sub(a0, b0); + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride * 2; + ref_ptr += ref_stride * 2; + } + break; + case 8: + for (i = 0; i < h; ++i) { + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr)); + const int16x8_t d = vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb = vec_vsx_ld(0, ref_ptr); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H))); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c new file mode 100644 index 0000000000..2dc66055cc --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/vpx_filter.h" + +// TODO(lu_zero): unroll +static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + vec_vsx_st(vec_vsx_ld(0, src), 0, dst); + vec_vsx_st(vec_vsx_ld(16, src), 16, dst); + vec_vsx_st(vec_vsx_ld(32, src), 32, dst); + vec_vsx_st(vec_vsx_ld(48, src), 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + switch (w) { + case 16: { + copy_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int i; + for (i = h; i--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} + +static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + vec_vsx_st(v, 0, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { + int i; + + for (i = h; i--;) { + const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst)); + const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst)); + const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst)); + const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst)); + vec_vsx_st(v0, 0, dst); + vec_vsx_st(v1, 16, dst); + vec_vsx_st(v2, 32, dst); + vec_vsx_st(v3, 48, dst); + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + switch (w) { + case 16: { + avg_w16(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_w32(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_w64(src, src_stride, dst, dst_stride, h); + break; + } + default: { + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + break; + } + } +} + +static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { + const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); + const int32x4_t bias = + vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); + const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS)); + const uint8x16_t v = vec_splat( + vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3); + vec_ste(v, 0, dst); +} + +static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst, + const uint8_t *const src_x, + const int16_t *const x_filter) { + const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); + const int16x8_t f = vec_vsx_ld(0, x_filter); + + convolve_line(dst, s, f); +} + +// TODO(lu_zero): Implement 8x8 and bigger block special cases +static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static VPX_FORCE_INLINE void convolve_avg_horiz( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + uint8_t v; + convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS], + x_filters[x_q4 & SUBPEL_MASK]); + dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, + uint8x16_t c, uint8x16_t d, + uint8x16_t e, uint8x16_t f, + uint8x16_t g, uint8x16_t h) { + uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b); + uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d); + uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f); + uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h); + + uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd); + uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh); + + return (uint8x16_t)vec_mergeh(abcd, efgh); +} + +static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst, + const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { + uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); + uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); + uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); + uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride); + uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride); + uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride); + uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride); + uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride); + const int16x8_t f = vec_vsx_ld(0, y_filter); + uint8_t buf[16]; + const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7); + + vec_vsx_st(s, 0, buf); + + convolve_line(dst, unpack_to_s16_h(s), f); +} + +static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, + int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + convolve_line_v(dst + y * dst_stride, + &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static VPX_FORCE_INLINE void convolve_avg_vert( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + uint8_t v; + convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride, + y_filters[y_q4 & SUBPEL_MASK]); + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, + int x0_q4, int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/media/libvpx/libvpx/vpx_dsp/prob.h b/media/libvpx/libvpx/vpx_dsp/prob.h index 5656ddbab4..a1c7de1674 100644 --- a/media/libvpx/libvpx/vpx_dsp/prob.h +++ b/media/libvpx/libvpx/vpx_dsp/prob.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PROB_H_ -#define VPX_DSP_PROB_H_ +#ifndef VPX_VPX_DSP_PROB_H_ +#define VPX_VPX_DSP_PROB_H_ #include @@ -30,9 +30,9 @@ typedef uint8_t vpx_prob; typedef int8_t vpx_tree_index; -#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) +#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) -#define vpx_complement(x) (255 - x) +#define vpx_complement(x) (255 - (x)) #define MODE_MV_COUNT_SAT 20 @@ -48,7 +48,7 @@ typedef const vpx_tree_index vpx_tree[]; static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) { assert(den != 0); { - const int p = (int)(((int64_t)num * 256 + (den >> 1)) / den); + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); // (p > 255) ? 255 : (p < 1) ? 1 : p; const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); return (vpx_prob)clipped_prob; @@ -103,4 +103,4 @@ DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); } // extern "C" #endif -#endif // VPX_DSP_PROB_H_ +#endif // VPX_VPX_DSP_PROB_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.c b/media/libvpx/libvpx/vpx_dsp/psnr.c index 47afd4388a..af1e638604 100644 --- a/media/libvpx/libvpx/vpx_dsp/psnr.c +++ b/media/libvpx/libvpx/vpx_dsp/psnr.c @@ -1,12 +1,12 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #include #include @@ -24,59 +24,46 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { } /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() -* and highbd_8_variance(). It should not. -*/ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { + * and highbd_8_variance(). It should not. + */ +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,29 +72,24 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; @@ -146,25 +128,20 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } @@ -200,7 +177,8 @@ int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, #if CONFIG_VP9_HIGHBITDEPTH void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, - uint32_t bit_depth, uint32_t in_bit_depth) { + uint32_t bit_depth, uint32_t in_bit_depth, + int spatial_layer_id) { const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; @@ -242,12 +220,13 @@ void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, psnr->samples[0] = total_samples; psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); + psnr->spatial_layer_id = spatial_layer_id; } #endif // !CONFIG_VP9_HIGHBITDEPTH void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr) { + PSNR_STATS *psnr, int spatial_layer_id) { static const double peak = 255.0; const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, @@ -278,4 +257,5 @@ void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, psnr->samples[0] = total_samples; psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak, (double)total_sse); + psnr->spatial_layer_id = spatial_layer_id; } diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.h b/media/libvpx/libvpx/vpx_dsp/psnr.h index f321131d0b..88fde5bb7e 100644 --- a/media/libvpx/libvpx/vpx_dsp/psnr.h +++ b/media/libvpx/libvpx/vpx_dsp/psnr.h @@ -1,17 +1,18 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ -#ifndef VPX_DSP_PSNR_H_ -#define VPX_DSP_PSNR_H_ +#ifndef VPX_VPX_DSP_PSNR_H_ +#define VPX_VPX_DSP_PSNR_H_ #include "vpx_scale/yv12config.h" +#include "vpx/vpx_encoder.h" #define MAX_PSNR 100.0 @@ -19,22 +20,18 @@ extern "C" { #endif -typedef struct { - double psnr[4]; // total/y/u/v - uint64_t sse[4]; // total/y/u/v - uint32_t samples[4]; // total/y/u/v -} PSNR_STATS; +typedef struct vpx_psnr_pkt PSNR_STATS; // TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t /*!\brief Converts SSE to PSNR -* -* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). -* -* \param[in] samples Number of samples -* \param[in] peak Max sample value -* \param[in] sse Sum of squared errors -*/ + * + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ double vpx_sse_to_psnr(double samples, double peak, double sse); int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH @@ -42,10 +39,11 @@ int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, - unsigned int bit_depth, unsigned int in_bit_depth); + unsigned int bit_depth, unsigned int in_bit_depth, + int spatial_layer_id); #endif void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, - PSNR_STATS *psnr); + PSNR_STATS *psnr, int spatial_layer_id); double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *phvs_y, @@ -54,4 +52,4 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_PSNR_H_ +#endif // VPX_VPX_DSP_PSNR_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c index b3910152c4..d7ec1a429a 100644 --- a/media/libvpx/libvpx/vpx_dsp/psnrhvs.c +++ b/media/libvpx/libvpx/vpx_dsp/psnrhvs.c @@ -126,8 +126,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, const uint8_t *_dst8 = dst; const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); - int16_t dct_s[8 * 8], dct_d[8 * 8]; - tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); double mask[8][8]; int pixels; int x; @@ -142,7 +144,7 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, been normalized and then squared." Their CSF matrix (from PSNR-HVS) was also constructed from the JPEG matrices. I can not find any obvious scheme of normalizing to produce their table, but if I multiply their - CSF by 0.38857 and square the result I get their masking table. + CSF by 0.3885746225901003 and square the result I get their masking table. I have no idea where this constant comes from, but deviating from it too greatly hurts MOS agreement. @@ -150,11 +152,15 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking of DCT basis functions", CD-ROM Proceedings of the Third International Workshop on Video Processing and Quality Metrics for Consumer - Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue #2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper. Since you are not using that, + divide by actual maximum coefficient. */ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) - mask[x][y] = - (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i; diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.c b/media/libvpx/libvpx/vpx_dsp/quantize.c index 3c7f9832f7..fac9136f8c 100644 --- a/media/libvpx/libvpx/vpx_dsp/quantize.c +++ b/media/libvpx/libvpx/vpx_dsp/quantize.c @@ -8,14 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); @@ -25,45 +30,44 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr) { + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp = abs_coeff + round_ptr[0]; const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -74,89 +78,87 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) eob = 0; - } + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN, + INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; + if (tmp) eob = 0; + *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { + { const int coeff = coeff_ptr[0]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } + *eob_ptr = eob + 1; } #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = (tran_low_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]); - if (tmp) eob = i; - } + if (tmp) eob = i; } } *eob_ptr = eob + 1; @@ -164,156 +166,158 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const struct ScanOrder *const scan_order) { + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; +#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH + // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than + // truncating with a cast, saturate the value. This is easier to implement + // on x86 and preserves the sign of the value. + dqcoeff_ptr[rc] = + clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX); +#else + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; +#endif // VPX_ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH - if (tmp) eob = idx_arr[i]; - } + if (tmp) eob = idx_arr[i]; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } diff --git a/media/libvpx/libvpx/vpx_dsp/quantize.h b/media/libvpx/libvpx/vpx_dsp/quantize.h index e132845463..8e138445e2 100644 --- a/media/libvpx/libvpx/vpx_dsp/quantize.h +++ b/media/libvpx/libvpx/vpx_dsp/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_QUANTIZE_H_ -#define VPX_DSP_QUANTIZE_H_ +#ifndef VPX_VPX_DSP_QUANTIZE_H_ +#define VPX_VPX_DSP_QUANTIZE_H_ #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -18,31 +18,29 @@ extern "C" { #endif -void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, +void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr); -void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant, uint16_t *eob_ptr); +void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_QUANTIZE_H_ +#endif // VPX_VPX_DSP_QUANTIZE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/sad.c b/media/libvpx/libvpx/vpx_dsp/sad.c index c80ef729bf..2a4c81d588 100644 --- a/media/libvpx/libvpx/vpx_dsp/sad.c +++ b/media/libvpx/libvpx/vpx_dsp/sad.c @@ -17,61 +17,63 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. */ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { +static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return sad; } -#define sadMxN(m, n) \ - unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ - } - -// This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ - void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ +// Compare |src_ptr| to 4 distinct references in |ref_array[4]| +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -84,8 +86,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -98,118 +98,110 @@ sadMxNx4D(16, 32) // 16x16 sadMxN(16, 16) -sadMxNxK(16, 16, 3) -sadMxNxK(16, 16, 8) sadMxNx4D(16, 16) // 16x8 sadMxN(16, 8) -sadMxNxK(16, 8, 3) -sadMxNxK(16, 8, 8) sadMxNx4D(16, 8) // 8x16 sadMxN(8, 16) -sadMxNxK(8, 16, 3) -sadMxNxK(8, 16, 8) sadMxNx4D(8, 16) // 8x8 sadMxN(8, 8) -sadMxNxK(8, 8, 3) -sadMxNxK(8, 8, 8) sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 sadMxN(4, 4) -sadMxNxK(4, 4, 3) -sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) /* clang-format on */ #if CONFIG_VP9_HIGHBITDEPTH static INLINE - unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { + unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int width, + int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, - const uint16_t *b, int b_stride, +static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, int width, int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } #define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, \ - int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + unsigned int vpx_highbd_sad##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ } \ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint16_t comp_pred[m * n]; \ - vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ + vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ + n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ + return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } -#define highbd_sadMxNxK(m, n, k) \ - void vpx_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ - } - -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -222,8 +214,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -236,42 +226,30 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) /* clang-format on */ diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.c b/media/libvpx/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 0000000000..bbbb6c3a17 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/skin_detection.h b/media/libvpx/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 0000000000..91640c33d5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_ +#define VPX_VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_DSP_SKIN_DETECTION_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/sse.c b/media/libvpx/libvpx/vpx_dsp/sse.c new file mode 100644 index 0000000000..c9d751859d --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/sse.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.c b/media/libvpx/libvpx/vpx_dsp/ssim.c index 7a29bd29f9..4a31f3d223 100644 --- a/media/libvpx/libvpx/vpx_dsp/ssim.c +++ b/media/libvpx/libvpx/vpx_dsp/ssim.c @@ -15,21 +15,6 @@ #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" -void vpx_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, - uint32_t *sum_sq_s, uint32_t *sum_sq_r, - uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 16; i++, s += sp, r += rp) { - for (j = 0; j < 16; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } - } -} void vpx_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { @@ -73,7 +58,7 @@ static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { - int64_t ssim_n, ssim_d; + double ssim_n, ssim_d; int64_t c1, c2; if (bd == 8) { // scale the constants by number of pixels @@ -90,14 +75,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { @@ -284,7 +269,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0 }; + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; diff --git a/media/libvpx/libvpx/vpx_dsp/ssim.h b/media/libvpx/libvpx/vpx_dsp/ssim.h index 4f2bb1d556..c382237fc6 100644 --- a/media/libvpx/libvpx/vpx_dsp/ssim.h +++ b/media/libvpx/libvpx/vpx_dsp/ssim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_SSIM_H_ -#define VPX_DSP_SSIM_H_ +#ifndef VPX_VPX_DSP_SSIM_H_ +#define VPX_VPX_DSP_SSIM_H_ #define MAX_SSIM_DB 100.0; @@ -84,4 +84,4 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, } // extern "C" #endif -#endif // VPX_DSP_SSIM_H_ +#endif // VPX_VPX_DSP_SSIM_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/subtract.c b/media/libvpx/libvpx/vpx_dsp/subtract.c index 95e7071b27..45c819e67a 100644 --- a/media/libvpx/libvpx/vpx_dsp/subtract.c +++ b/media/libvpx/libvpx/vpx_dsp/subtract.c @@ -16,37 +16,37 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - diff += diff_stride; - pred += pred_stride; - src += src_stride; + diff_ptr += diff_stride; + pred_ptr += pred_stride; + src_ptr += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src8_ptr, + ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd) { int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; + diff_ptr[c] = src[c] - pred[c]; } - diff += diff_stride; + diff_ptr += diff_stride; pred += pred_stride; src += src_stride; } diff --git a/media/libvpx/libvpx/vpx_dsp/sum_squares.c b/media/libvpx/libvpx/vpx_dsp/sum_squares.c index 7c535ac2db..b80cd588e4 100644 --- a/media/libvpx/libvpx/vpx_dsp/sum_squares.c +++ b/media/libvpx/libvpx/vpx_dsp/sum_squares.c @@ -10,8 +10,7 @@ #include "./vpx_dsp_rtcd.h" -uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { int r, c; uint64_t ss = 0; @@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, const int16_t v = src[c]; ss += v * v; } - src += src_stride; + src += stride; } return ss; diff --git a/media/libvpx/libvpx/vpx_dsp/txfm_common.h b/media/libvpx/libvpx/vpx_dsp/txfm_common.h index fd27f928ed..25f4fdb327 100644 --- a/media/libvpx/libvpx/vpx_dsp/txfm_common.h +++ b/media/libvpx/libvpx/vpx_dsp/txfm_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_TXFM_COMMON_H_ -#define VPX_DSP_TXFM_COMMON_H_ +#ifndef VPX_VPX_DSP_TXFM_COMMON_H_ +#define VPX_VPX_DSP_TXFM_COMMON_H_ #include "vpx_dsp/vpx_dsp_common.h" @@ -25,42 +25,42 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; -#endif // VPX_DSP_TXFM_COMMON_H_ +#endif // VPX_VPX_DSP_TXFM_COMMON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/variance.c b/media/libvpx/libvpx/vpx_dsp/variance.c index d1fa0560d5..1c476542fa 100644 --- a/media/libvpx/libvpx/vpx_dsp/variance.c +++ b/media/libvpx/libvpx/vpx_dsp/variance.c @@ -21,36 +21,37 @@ static const uint8_t bilinear_filters[8][2] = { { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion = 0; int r, c; for (r = 0; r < 4; ++r) { for (c = 0; c < 4; ++c) { - int diff = a[c] - b[c]; + int diff = src_ptr[c] - ref_ptr[c]; distortion += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return distortion; } -uint32_t vpx_get_mb_ss_c(const int16_t *a) { +uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { - sum += a[i] * a[i]; + sum += src_ptr[i] * src_ptr[i]; } return sum; } -static void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -58,13 +59,13 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } @@ -76,24 +77,23 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -106,91 +106,90 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -#define VAR(W, H) \ - uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ - void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ -#define MSE(W, H) \ - uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ } /* All three forms of the variance are available in the same sizes. */ @@ -237,128 +236,140 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); *sum = 0; *sse = 0; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)sse_long; *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ } static void highbd_var_filter_block2d_bil_first_pass( @@ -403,111 +414,111 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -538,12 +549,10 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; diff --git a/media/libvpx/libvpx/vpx_dsp/variance.h b/media/libvpx/libvpx/vpx_dsp/variance.h index 4c482551e0..ccdb2f90ba 100644 --- a/media/libvpx/libvpx/vpx_dsp/variance.h +++ b/media/libvpx/libvpx/vpx_dsp/variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_VARIANCE_H_ -#define VPX_DSP_VARIANCE_H_ +#ifndef VPX_VPX_DSP_VARIANCE_H_ +#define VPX_VPX_DSP_VARIANCE_H_ #include "./vpx_config.h" @@ -22,46 +22,45 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b_ptr, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); -typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, - int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride, + uint8_t *ref_ptr, int ref_stride, int n); -typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride, const uint8_t *const b_array[], - int b_stride, unsigned int *sad_array); + int ref_stride, unsigned int *sad_array); -typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, unsigned int *sse); -typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_subpixvariance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); typedef unsigned int (*vpx_subp_avg_variance_fn_t)( - const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset, - const uint8_t *b_ptr, int b_stride, unsigned int *sse, + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + #if CONFIG_VP8 typedef struct variance_vtable { vpx_sad_fn_t sdf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; -#if ARCH_X86 || ARCH_X86_64 +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 vp8_copy32xn_fn_t copymem; #endif } vp8_variance_fn_ptr_t; @@ -70,13 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. + vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 @@ -84,4 +85,4 @@ typedef struct vp9_variance_vtable { } // extern "C" #endif -#endif // VPX_DSP_VARIANCE_H_ +#endif // VPX_VPX_DSP_VARIANCE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c index cab6368e60..e55a963f9d 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c +++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.c @@ -113,11 +113,52 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); +} + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)y0_q4; + (void)y_step_q4; + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); +} + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); +} + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); +} + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -130,118 +171,49 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. uint8_t temp[64 * 135]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); -} - -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); -} - -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); -} - -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -251,15 +223,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); @@ -269,63 +242,60 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -343,13 +313,11 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -369,13 +337,11 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -395,13 +361,11 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -423,11 +387,11 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -450,116 +414,97 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4, - x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1), - 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, - bd); + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, - 0, NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, + bd); } -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -569,19 +514,18 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h index ee9744b3ae..d5793e17ad 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h +++ b/media/libvpx/libvpx/vpx_dsp/vpx_convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_VPX_CONVOLVE_H_ -#define VPX_DSP_VPX_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_VPX_DSP_VPX_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif @@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VPX_DSP_VPX_CONVOLVE_H_ +#endif // VPX_VPX_DSP_VPX_CONVOLVE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk index 18c225658e..357ad08508 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk @@ -13,6 +13,13 @@ DSP_SRCS-yes += vpx_dsp_common.h DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h +# This file is included in libs.mk. Including it here would cause it to be +# compiled into an object. Even as an empty file, this would create an +# executable section on the stack. +#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM) + # bit reader DSP_SRCS-yes += prob.h DSP_SRCS-yes += prob.c @@ -24,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c DSP_SRCS-yes += bitwriter_buffer.h DSP_SRCS-yes += psnr.c DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c endif ifeq ($(CONFIG_DECODERS),yes) @@ -40,14 +52,14 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c -DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -60,11 +72,14 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c @@ -72,14 +87,19 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c +DSP_SRCS-yes += vpx_filter.h +ifeq ($(CONFIG_VP9),yes) # interpolation filters DSP_SRCS-yes += vpx_convolve.c DSP_SRCS-yes += vpx_convolve.h -DSP_SRCS-yes += vpx_filter.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(VPX_ARCH_X86)$(VPX_ARCH_X86_64) += x86/convolve.h + +DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_4t_intrin_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm @@ -88,19 +108,30 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_vpx_convolve8_sve.c +DSP_SRCS-$(HAVE_SVE2) += arm/highbd_vpx_convolve8_sve2.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM) DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h DSP_SRCS-yes += arm/vpx_convolve_neon.c else ifeq ($(HAVE_NEON),yes) @@ -108,6 +139,8 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -121,6 +154,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h +DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c # common (dspr2) DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h @@ -135,11 +169,24 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c +DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c + +# common (lsx) +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h + # loop filters DSP_SRCS-yes += loopfilter.c -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_intrin_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) @@ -161,14 +208,21 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_16_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_8_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/loopfilter_4_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_VP9 DSP_SRCS-yes += txfm_common.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h +DSP_SRCS-$(HAVE_LSX) += loongarch/txfm_macros_lsx.h # forward transform ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += fwd_txfm.c @@ -177,15 +231,27 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h -ifeq ($(ARCH_X86_64),yes) +ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c endif # CONFIG_VP9_ENCODER # inverse transform @@ -194,13 +260,15 @@ DSP_SRCS-yes += inv_txfm.h DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm -endif # ARCH_X86_64 +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c @@ -214,29 +282,42 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/idct32x32_lsx.c else # CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/idct_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c @@ -249,56 +330,89 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h -DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c -endif -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c -ifeq ($(ARCH_X86_64),yes) +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c +endif +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c +ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm endif +DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c endif # CONFIG_VP9_ENCODER +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c +DSP_SRCS-$(HAVE_SVE) += arm/sum_squares_sve.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c -DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm -DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm +DSP_SRCS-$(HAVE_LSX) += loongarch/sad_lsx.c + +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad_avx512.c -DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm +DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c +DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c + +DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS @@ -307,33 +421,64 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c -DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.h +DSP_SRCS-$(HAVE_LSX) += loongarch/variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/sub_pixel_variance_lsx.c +DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c + +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + +DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c +DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c -ifeq ($(ARCH_X86_64),yes) +ifeq ($(VPX_ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm -endif # ARCH_X86_64 +endif # VPX_ARCH_X86_64 -DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/highbd_variance_neon_dotprod.c +DSP_SRCS-$(HAVE_SVE) += arm/highbd_variance_sve.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h + +# PPC VSX utilities +DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/txfm_common_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h + +# X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h + +# LSX utilities +DSP_SRCS-$(HAVE_LSX) += loongarch/bitdepth_conversion_lsx.h DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h index 49d36e5458..9d2668277a 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_common.h @@ -8,8 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_VPX_DSP_COMMON_H_ -#define VPX_DSP_VPX_DSP_COMMON_H_ +#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_ +#define VPX_VPX_DSP_VPX_DSP_COMMON_H_ + +#include #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,8 +27,8 @@ extern "C" { #define VPX_SWAP(type, a, b) \ do { \ type c = (b); \ - b = a; \ - a = c; \ + (b) = a; \ + (a) = c; \ } while (0) #if CONFIG_VP9_HIGHBITDEPTH @@ -43,9 +45,22 @@ typedef int32_t tran_high_t; typedef int16_t tran_low_t; #endif // CONFIG_VP9_HIGHBITDEPTH +typedef int16_t tran_coef_t; + +// Visual Studio 2022 (cl.exe) < 17.7 targeting AArch64 with optimizations +// enabled produces invalid code for clip_pixel() when the return type is +// uint8_t. See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +#if defined(_MSC_VER) && _MSC_VER < 1937 && defined(_M_ARM64) && \ + !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); @@ -55,7 +70,10 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -64,10 +82,15 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_VP9_HIGHBITDEPTH + +// Returns the saturating cast of a double value to int. +static INLINE int saturate_cast_double_to_int(double d) { + if (d > INT_MAX) return INT_MAX; + return (int)d; +} #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_VPX_DSP_COMMON_H_ +#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c index 030c456d39..2b8c656afb 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd.c @@ -12,4 +12,4 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_dsp_rtcd() { once(setup_rtcd_internal); } +void vpx_dsp_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index ed7dd4da55..db508e9a9d 100644 --- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_dsp_forward_decls() { print < #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -26,19 +27,21 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; -static INLINE const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static INLINE int get_filter_offset(const int16_t *f, - const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); +static INLINE int vpx_get_filter_taps(const int16_t *const filter) { + if (filter[0] | filter[7]) { + return 8; + } + if (filter[1] | filter[6]) { + return 6; + } + if (filter[2] | filter[5]) { + return 4; + } + return 2; } #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_VPX_FILTER_H_ +#endif // VPX_VPX_DSP_VPX_FILTER_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm index f758da22dc..f51718cf99 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/add_noise_sse2.asm @@ -11,10 +11,12 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, ; int blackclamp, int whiteclamp, ; int width, int height, int pitch) -global sym(vpx_plane_add_noise_sse2) PRIVATE +globalsym(vpx_plane_add_noise_sse2) sym(vpx_plane_add_noise_sse2): push rbp mov rbp, rsp @@ -26,13 +28,13 @@ sym(vpx_plane_add_noise_sse2): mov rdx, 0x01010101 mov rax, arg(2) mul rdx - movd xmm3, rax + movq xmm3, rax pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp mov rdx, 0x01010101 mov rax, arg(3) mul rdx - movd xmm4, rax + movq xmm4, rax pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 0000000000..61e4e73c5b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = _mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); + + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index b0a104bad0..4447dfab7c 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -11,8 +11,18 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; @@ -136,6 +146,56 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { return (avg + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const __m128i zero = _mm_setzero_si128(); + s0 = _mm_loadu_si128((const __m128i *)(s)); + s1 = _mm_loadu_si128((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpackhi_epi16(s0, zero); + s0 = _mm_unpacklo_epi16(s0, zero); + s0 = _mm_add_epi32(s0, s1); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); + avg = (unsigned int)_mm_cvtsi128_si32(s0); + + return (avg + 32) >> 6; +} + +unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + s0 = _mm_loadl_epi64((const __m128i *)(s)); + s1 = _mm_loadl_epi64((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2)); + avg = _mm_extract_epi16(s0, 0); + + return (avg + 8) >> 4; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; @@ -212,8 +272,9 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -222,42 +283,79 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride)); hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - _mm_store_si128((__m128i *)coeff, src[0]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = + const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,25 +369,119 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); - coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); - coeff += 8; + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; } } -int vpx_satd_sse2(const int16_t *coeff, int length) { +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); + + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int vpx_satd_sse2(const tran_low_t *coeff, int length) { int i; const __m128i zero = _mm_setzero_si128(); __m128i accum = zero; for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i src_line = load_tran_low(coeff); const __m128i inv = _mm_sub_epi16(zero, src_line); const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); @@ -309,7 +501,7 @@ int vpx_satd_sse2(const int16_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, +void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); @@ -358,16 +550,16 @@ void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, _mm_storeu_si128((__m128i *)hbuf, s1); } -int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) { __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); __m128i s0 = _mm_sad_epu8(src_line, zero); __m128i s1; int i; for (i = 16; i < width; i += 16) { ref += 16; - src_line = _mm_load_si128((const __m128i *)ref); + src_line = _mm_loadu_si128((const __m128i *)ref); s1 = _mm_sad_epu8(src_line, zero); s0 = _mm_adds_epu16(s0, s1); } @@ -378,7 +570,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { return _mm_extract_epi16(s0, 0); } -int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) { int idx; int width = 4 << bwl; int16_t mean; @@ -416,7 +608,7 @@ int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { v1 = _mm_srli_epi64(sse, 32); sse = _mm_add_epi32(sse, v1); - mean = _mm_extract_epi16(sum, 0); + mean = (int16_t)_mm_extract_epi16(sum, 0); return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 0000000000..f4357998c9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. + assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c new file mode 100644 index 0000000000..c6e70f744e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + /* comp_pred and pred must be 16 byte aligned. */ + assert(((intptr_t)comp_pred & 0xf) == 0); + assert(((intptr_t)pred & 0xf) == 0); + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); + const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); + const __m128i avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)(comp_pred + x), avg); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else { // width must be 4 or 8. + int i; + // Process 16 elements at a time. comp_pred and pred have width == stride + // and therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are + // all divisible by 16 so just ref needs to be massaged when loading. + for (i = 0; i < width * height; i += 16) { + const __m128i p = _mm_load_si128((const __m128i *)pred); + __m128i r; + __m128i avg; + if (width == ref_stride) { + r = _mm_loadu_si128((const __m128i *)ref); + ref += 16; + } else if (width == 4) { + r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride), + loadu_int32(ref + 2 * ref_stride), + loadu_int32(ref + ref_stride), loadu_int32(ref)); + + ref += 4 * ref_stride; + } else { + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + assert(width == 8); + r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), + (const __m64 *)(ref + ref_stride))); + + ref += 2 * ref_stride; + } + avg = _mm_avg_epu8(p, r); + _mm_store_si128((__m128i *)comp_pred, avg); + + pred += 16; + comp_pred += 16; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm index 26412e8e43..9122b5a401 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm @@ -8,42 +8,50 @@ ; be found in the AUTHORS file in the root of the source tree. ; -%define private_prefix vpx - %include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the hadamard transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 ; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro +%macro TRANSPOSE8X8 10 + ; stage 1 + punpcklwd m%9, m%1, m%2 + punpcklwd m%10, m%3, m%4 + punpckhwd m%1, m%2 + punpckhwd m%3, m%4 -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 + punpcklwd m%2, m%5, m%6 + punpcklwd m%4, m%7, m%8 + punpckhwd m%5, m%6 + punpckhwd m%7, m%8 - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 + ; stage 2 + punpckldq m%6, m%9, m%10 + punpckldq m%8, m%1, m%3 + punpckhdq m%9, m%10 + punpckhdq m%1, m%3 - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 + punpckldq m%10, m%2, m%4 + punpckldq m%3, m%5, m%7 + punpckhdq m%2, m%4 + punpckhdq m%5, m%7 - SWAP %2, %5 - SWAP %4, %7 + ; stage 3 + punpckhqdq m%4, m%9, m%2 ; out3 + punpcklqdq m%9, m%2 ; out2 + punpcklqdq m%7, m%1, m%5 ; out6 + punpckhqdq m%1, m%5 ; out7 + + punpckhqdq m%2, m%6, m%10 ; out1 + punpcklqdq m%6, m%10 ; out0 + punpcklqdq m%5, m%8, m%3 ; out4 + punpckhqdq m%8, m%3 ; out5 + + SWAP %6, %1 + SWAP %3, %9 + SWAP %8, %6 %endmacro %macro HMD8_1D 0 @@ -87,8 +95,9 @@ SECTION .text SWAP 7, 9 %endmacro + INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output +cglobal hadamard_8x8, 3, 5, 11, input, stride, output lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output mova m7, [inputq + r3] HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 HMD8_1D - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + STORE_TRAN_LOW 0, outputq, 0, 8, 9 + STORE_TRAN_LOW 1, outputq, 8, 8, 9 + STORE_TRAN_LOW 2, outputq, 16, 8, 9 + STORE_TRAN_LOW 3, outputq, 24, 8, 9 + STORE_TRAN_LOW 4, outputq, 32, 8, 9 + STORE_TRAN_LOW 5, outputq, 40, 8, 9 + STORE_TRAN_LOW 6, outputq, 48, 8, 9 + STORE_TRAN_LOW 7, outputq, 56, 8, 9 RET %endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..c02b47a3eb --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 16 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m256i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +#else + return _mm256_loadu_si256((const __m256i *)a); +#endif +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm new file mode 100644 index 0000000000..aacf71f7ac --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -0,0 +1,90 @@ +; +; Copyright (c) 2017 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm. +; vpx_config.asm is not guarded so can not be included twice. Because this will +; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be +; included after those files. + +; Increment register by sizeof() tran_low_t * 8. +%macro INCREMENT_TRAN_LOW 1 +%if CONFIG_VP9_HIGHBITDEPTH + add %1, 32 +%else + add %1, 16 +%endif +%endmacro + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 +%if CONFIG_VP9_HIGHBITDEPTH + lea %1, [%1 + %2 * 4] +%else + lea %1, [%1 + %2 * 2] +%endif +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. +%macro LOAD_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%else + mova m%1, [%2 + (%3) * 2] +%endif +%endmacro + +; Store m%1 to %2 + %3. +; %3 is the offset in elements, not bytes. +; If 5 arguments are provided then m%1 is corrupted. +; If 6 arguments are provided then m%1 is preserved. +; If tran_low_t is 16 bits (low bit depth configuration) then store the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign +; extend the values first. +; Uses m%4-m%6 as scratch registers for high bit depth. +%macro STORE_TRAN_LOW 5-6 +%if CONFIG_VP9_HIGHBITDEPTH + pxor m%4, m%4 + mova m%5, m%1 + %if %0 == 6 + mova m%6, m%1 + %endif + pcmpgtw m%4, m%1 + punpcklwd m%5, m%4 + %if %0 == 5 + punpckhwd m%1, m%4 + %else + punpckhwd m%6, m%4 + %endif + mova [%2 + (%3) * 4 + 0], m%5 + %if %0 == 5 + mova [%2 + (%3) * 4 + 16], m%1 + %else + mova [%2 + (%3) * 4 + 16], m%6 + %endif +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro + +; Store zeros (in m%1) to %2 + %3. +; %3 is the offset in elements, not bytes. +%macro STORE_ZERO_TRAN_LOW 3 +%if CONFIG_VP9_HIGHBITDEPTH + mova [%2 + (%3) * 4 + 0], m%1 + mova [%2 + (%3) * 4 + 16], m%1 +%else + mova [%2 + (%3) * 2], m%1 +%endif +%endmacro diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fdct.h b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h similarity index 76% rename from media/libvpx/libvpx/vpx_dsp/x86/fdct.h rename to media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h index 54a6d81fcb..74dde656b1 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/fdct.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_FDCT_H_ -#define VPX_DSP_X86_FDCT_H_ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ #include @@ -16,13 +16,12 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" -// Load 8 16 bit values. If the source is 32 bits then cast down. -// This does not saturate values. It only truncates. +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. static INLINE __m128i load_tran_low(const tran_low_t *a) { #if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2], - (int16_t)a[3], (int16_t)a[4], (int16_t)a[5], - (int16_t)a[6], (int16_t)a[7]); + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); #else return _mm_load_si128((const __m128i *)a); #endif @@ -54,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { _mm_store_si128((__m128i *)(a), zero); #endif } -#endif // VPX_DSP_X86_FDCT_H_ +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h index d7468ad7ca..c339600556 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/convolve.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve.h @@ -7,89 +7,130 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_H_ #include #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" +#include "vpx_ports/compiler_attributes.h" +// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty +// hacky and awful to read. Note that there is a filter_x[3] == 128 check in +// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function +// assumes the filter is always 8 tap. typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we +// have 4-tap vert avg filter. +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_row = filter[offset]; \ + (void)x0_q4; \ (void)x_step_q4; \ - (void)filter_y; \ + (void)y0_q4; \ (void)y_step_q4; \ - assert(filter[3] != 128); \ + assert(filter_row[3] != 128); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ while (w >= 16) { \ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ - } else { \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } \ + (void)num_taps; \ + } else { \ + const int num_taps = 2; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ } \ } -#define FUN_CONV_2D(avg, opt) \ - void vpx_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ +#define FUN_CONV_2D(avg, opt, is_avg) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ + dst, dst_stride, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ } #if CONFIG_VP9_HIGHBITDEPTH @@ -101,98 +142,138 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - if (filter[0] | filter[1] | filter[2]) { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ - } - -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ + is_avg) \ + void vpx_highbd_convolve8_##name##_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_row = filter_kernel[offset]; \ + if (step_q4 == 16 && filter_row[3] != 128) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ + const int num_taps = 2; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ + } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ } \ } -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // VPX_DSP_X86_CONVOLVE_H_ +#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ + filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ + bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ + } + +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_X86_CONVOLVE_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 0000000000..ebee964b18 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1); +} + +static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1); +} + +static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); +} + +static INLINE __m256i mm256_round_epi32(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth); + return _mm256_srai_epi32(nearest_src, depth); +} + +static INLINE __m256i mm256_round_epi16(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); + return _mm256_srai_epi16(nearest_src, depth); +} + +static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0, + const __m256i *const src_1, + const __m256i *const ker_0, + const __m256i *const ker_1) { + const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0); + const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1); + return _mm256_add_epi32(tmp_0, tmp_1); +} + +#undef MM256_BROADCASTSI128_SI256 + +#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h new file mode 100644 index 0000000000..8443546394 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_sse2.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 2 and 3 to return 3 2 3 2 3 2 3 2 as 16-bit words +static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpacklo_epi32(*reg, *reg); + return _mm_unpackhi_epi64(tmp, tmp); +} + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// values at index 2 and 3 to return 5 4 5 4 5 4 5 4 as 16-bit words. +static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpackhi_epi32(*reg, *reg); + return _mm_unpacklo_epi64(tmp, tmp); +} + +// Interprets src as 8-bit words, zero extends to form 16-bit words, then +// multiplies with ker and add the adjacent results to form 32-bit words. +// Finally adds the result from 1 and 2 together. +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); + const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); + const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +// Interprets src as 16-bit words, then multiplies with ker and add the +// adjacent results to form 32-bit words. Finally adds the result from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { + const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); + const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); + return _mm_packs_epi32(madd_1, madd_2); +} + +// Interleaves src_1 and src_2 +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { + const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); + const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); + return _mm_packs_epi32(tmp_1, tmp_2); +} + +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); + return _mm_srai_epi16(nearest_src, depth); +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h new file mode 100644 index 0000000000..8a4b165133 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ + +#include +#include // SSSE3 + +#include "./vpx_config.h" + +static INLINE void shuffle_filter_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. + const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. + const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm index ebca50930a..b3af677d2e 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -78,11 +78,13 @@ %endmacro %macro UPDATE_FLIMIT 0 - movdqa xmm2, XMMWORD PTR [rbx] - movdqa [rsp], xmm2 + movdqu xmm2, XMMWORD PTR [rbx] + movdqu [rsp], xmm2 add rbx, 16 %endmacro +SECTION .text + ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, @@ -93,7 +95,7 @@ ; int *flimits, ; int size ;) -global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE +globalsym(vpx_post_proc_down_and_across_mb_row_sse2) sym(vpx_post_proc_down_and_across_mb_row_sse2): push rbp mov rbp, rsp @@ -230,241 +232,10 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_sse2(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_sse2) PRIVATE -sym(vpx_mbpost_proc_down_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vpx_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; c // AVX2 #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. +static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. + for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, (tran_low_t *)out0, width, + height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + +#if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" @@ -21,3 +397,4 @@ #include "vpx_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT #undef FDCT32x32_2D_AVX2 #undef FDCT32x32_HIGH_PRECISION +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 743e55e635..d546f02a14 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -93,9 +93,9 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { #if DCT_HIGH_BIT_DEPTH // Check inputs small enough to use optimised code cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); + _mm_cmplt_epi16(in0, _mm_set1_epi16((int16_t)0xfc00))); cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); + _mm_cmplt_epi16(in1, _mm_set1_epi16((int16_t)0xfc00))); test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); if (test) { vpx_highbd_fdct4x4_c(input, output, stride); @@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); @@ -778,6 +778,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. { @@ -834,6 +835,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. { diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h index 5201e764c8..5aa2779706 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_FWD_TXFM_SSE2_H_ -#define VPX_DSP_X86_FWD_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ #ifdef __cplusplus extern "C" { @@ -36,7 +36,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { static INLINE int check_epi16_overflow_x2(const __m128i *preg0, const __m128i *preg1) { const __m128i max_overflow = _mm_set1_epi16(0x7fff); - const __m128i min_overflow = _mm_set1_epi16(0x8000); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), _mm_cmpeq_epi16(*preg0, min_overflow)); __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), @@ -50,7 +50,7 @@ static INLINE int check_epi16_overflow_x4(const __m128i *preg0, const __m128i *preg2, const __m128i *preg3) { const __m128i max_overflow = _mm_set1_epi16(0x7fff); - const __m128i min_overflow = _mm_set1_epi16(0x8000); + const __m128i min_overflow = _mm_set1_epi16((short)0x8000); __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), _mm_cmpeq_epi16(*preg0, min_overflow)); __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), @@ -368,4 +368,4 @@ static INLINE void transpose_and_output8x8( } // extern "C" #endif -#endif // VPX_DSP_X86_FWD_TXFM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 78a1dbb24f..2c338fb5dd 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -10,10 +10,6 @@ %include "third_party/x86inc/x86inc.asm" -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. - SECTION_RODATA pw_11585x2: times 8 dw 23170 @@ -31,108 +27,12 @@ TRANSFORM_COEFFS 9102, 13623 SECTION .text -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -; 1D forward 8x8 DCT transform -%macro FDCT8_1D 1 - SUM_SUB 0, 7, 9 - SUM_SUB 1, 6, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 3, 4, 9 - - SUM_SUB 0, 3, 9 - SUM_SUB 1, 2, 9 - SUM_SUB 6, 5, 9 -%if %1 == 0 - SUM_SUB 0, 1, 9 -%endif - - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 - - pmulhrsw m6, m12 - pmulhrsw m5, m12 -%if %1 == 0 - pmulhrsw m0, m12 - pmulhrsw m1, m12 -%else - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 - SWAP 0, 1 -%endif - - SUM_SUB 4, 5, 9 - SUM_SUB 7, 6, 9 - BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 - SWAP 1, 4 - SWAP 3, 6 -%endmacro - -%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 - psraw m%3, m%1, 15 - psraw m%4, m%2, 15 - psubw m%1, m%3 - psubw m%2, m%4 - psraw m%1, 1 - psraw m%2, 1 -%endmacro - +%if VPX_ARCH_X86_64 INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] - pxor m11, m11 + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D 0 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 - FDCT8_1D 1 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + paddw m9, m1, m6 + psubw m1, m6 - DIVIDE_ROUND_2X 0, 1, 9, 10 - DIVIDE_ROUND_2X 2, 3, 9, 10 - DIVIDE_ROUND_2X 4, 5, 9, 10 - DIVIDE_ROUND_2X 6, 7, 9, 10 + paddw m7, m2, m5 + psubw m2, m5 - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + mova [outputq + 0], m4 + mova [outputq + 16], m5 + mova [outputq + 32], m3 + mova [outputq + 48], m0 + mova [outputq + 64], m10 + mova [outputq + 80], m9 + mova [outputq + 96], m7 + mova [outputq + 112], m2 RET %endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 0000000000..01a52ec8bf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1495 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(w % 4 == 0); + if (w > 32) { // w = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // w = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // w = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w > 4) { // w = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // w = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + (void)bd; + + assert(w % 4 == 0); + if (w > 32) { // w = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 16) { // w = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (w > 8) { // w = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (w > 4) { // w = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // w = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void vpx_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void vpx_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void vpx_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void vpx_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void vpx_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +// Calculation with averaging the input pixels + +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void vpx_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +static void vpx_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will extract the middle four elements of the kernel into two registers + // in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum of the first half. + // Calling add gives us first half of the output. Repat again to get the whole + // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows + // at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg, res_first, res_last; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for first half + res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for second half + res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round each result + res_first = mm256_round_epi32(&res_first, ®_round, CONV8_ROUNDING_BITS); + res_last = mm256_round_epi32(&res_last, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_first, res_last); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); + } +} + +static void vpx_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +static void vpx_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } +} + +static void vpx_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void vpx_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + // Result after multiply and add + __m256i res_reg; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + + // Output + res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel + + // Result after multiply and add + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); + + // Output from first half + res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, + &kernel_reg_23, &kernel_reg_45); + + // Output from second half + res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg_lo = + mm256_round_epi32(&res_reg_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_hi = + mm256_round_epi32(&res_reg_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; + +#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 +#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 +#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 +#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_avx2 \ + vpx_highbd_filter_block1d16_v8_avg_avx2 +#define vpx_highbd_filter_block1d16_h4_avg_avx2 \ + vpx_highbd_filter_block1d16_h8_avg_avx2 +#define vpx_highbd_filter_block1d8_v4_avg_avx2 \ + vpx_highbd_filter_block1d8_v8_avg_avx2 +#define vpx_highbd_filter_block1d8_h4_avg_avx2 \ + vpx_highbd_filter_block1d8_h8_avg_avx2 +#define vpx_highbd_filter_block1d4_v4_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_avx2 +#define vpx_highbd_filter_block1d4_h4_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_avx2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , avx2, 0) +HIGH_FUN_CONV_2D(, avx2, 0) + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; + +#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ + vpx_highbd_filter_block1d4_h2_avg_sse2 +#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ + vpx_highbd_filter_block1d4_v2_avg_sse2 + +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) +HIGH_FUN_CONV_2D(avg_, avx2, 1) + +#undef HIGHBD_FUNC diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c new file mode 100644 index 0000000000..f4f7235d13 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 16); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 0000000000..7898ee12c8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + vpx_highbd_idct16_4col_sse4_1(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + vpx_highbd_idct16_4col_sse4_1(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c new file mode 100644 index 0000000000..c710e89954 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 32); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 0000000000..2d0a53ac0a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] = _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c new file mode 100644 index 0000000000..b9c8884f99 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function. + + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = (int16_t)_mm_extract_epi16(max_input, 0); + min = (int16_t)_mm_extract_epi16(min_input, 0); + } + + if (bd == 8 || (max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} + +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int a1, i; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < 4; ++i) { + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, dc, bd); + _mm_storel_epi64((__m128i *)dest, d); + dest += stride; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 0000000000..fe74d272ad --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4_sse4_1(io); + highbd_idct4_sse4_1(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c new file mode 100644 index 0000000000..bb7a510e15 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 8); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 0000000000..8b2e3d2415 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + vpx_highbd_idct8x8_half1d_sse4_1(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + vpx_highbd_idct8x8_half1d_sse4_1(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 0000000000..43634aea3a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_castps_si128( + _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1))); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 0000000000..d673fac493 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, + rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1 }; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm index c61b62104f..caf506ac07 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd movd m1, [aboveq-2] movq m0, [aboveq] pshuflw m1, m1, 0x0 @@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 pcmpeqw m2, m2 @@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps RET INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one movd m1, [aboveq-2] mova m0, [aboveq] pshuflw m1, m1, 0x0 @@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one pxor m3, m3 pxor m4, m4 pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 + pinsrw m4, bdd, 0 pshuflw m3, m3, 0x0 DEFINE_ARGS dst, stride, line, left punpcklqdq m3, m3 @@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd punpcklqdq m2, m2 psllw m3, m4 pcmpeqw m5, m5 @@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd movd m0, [aboveq-2] mova m1, [aboveq] mova m2, [aboveq+16] @@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps pshuflw m0, m0, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m5, m5 - movd m6, bpsd + movd m6, bdd psllw m5, m6 pcmpeqw m7, m7 pxor m6, m6 ; min possible value diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h new file mode 100644 index 0000000000..1d07391b02 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +// Note: There is no 64-bit bit-level shifting SIMD instruction. All +// coefficients are left shifted by 2, so that dct_const_round_shift() can be +// done by right shifting 2 bytes. + +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 +} + +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative. +static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_setzero_si128(); + // Faster than _mm_set1_epi16((1 << bd) - 1). + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { + int a1, i, j; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < size; ++i) { + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); + } + dest += stride; + } +} + +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); +} + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 0000000000..f446bb13f3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +static INLINE void highbd_idct4_sse4_1(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index 8670b28958..9f45623dee 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -12,14 +12,13 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i ubounded; __m128i lbounded; __m128i retval; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); __m128i t80, max, min; @@ -48,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file. -void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; __m128i ps1, qs1, ps0, qs0; @@ -71,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, __m128i eight, four; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); } - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + q4 = _mm_load_si128((__m128i *)(s + 4 * pitch)); + p4 = _mm_load_si128((__m128i *)(s - 5 * pitch)); + q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); // highbd_filter_mask abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); @@ -112,14 +111,14 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // highbd_hev_mask (in C code this is actually called from highbd_filter4) flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); @@ -133,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // return ~mask // lp filter @@ -208,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // (because, in both vars, each block of 16 either all 1s or all 0s) flat = _mm_and_si128(flat, mask); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + p5 = _mm_load_si128((__m128i *)(s - 6 * pitch)); + q5 = _mm_load_si128((__m128i *)(s + 5 * pitch)); + p6 = _mm_load_si128((__m128i *)(s - 7 * pitch)); + q6 = _mm_load_si128((__m128i *)(s + 6 * pitch)); + p7 = _mm_load_si128((__m128i *)(s - 8 * pitch)); + q7 = _mm_load_si128((__m128i *)(s + 7 * pitch)); // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) @@ -390,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q6 = _mm_and_si128(flat2, flat2_q6); // get values for when (flat2 && flat && mask) q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); + _mm_store_si128((__m128i *)(s - 7 * pitch), p6); + _mm_store_si128((__m128i *)(s + 6 * pitch), q6); p5 = _mm_andnot_si128(flat2, p5); // p5 remains unchanged if !(flat2 && flat && mask) @@ -405,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // get values for when (flat2 && flat && mask) q5 = _mm_or_si128(q5, flat2_q5); // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); + _mm_store_si128((__m128i *)(s - 6 * pitch), p5); + _mm_store_si128((__m128i *)(s + 5 * pitch), q5); p4 = _mm_andnot_si128(flat2, p4); // p4 remains unchanged if !(flat2 && flat && mask) @@ -418,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q4 = _mm_and_si128(flat2, flat2_q4); // get values for when (flat2 && flat && mask) q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); + _mm_store_si128((__m128i *)(s - 5 * pitch), p4); + _mm_store_si128((__m128i *)(s + 4 * pitch), q4); p3 = _mm_andnot_si128(flat2, p3); // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -431,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q3 = _mm_and_si128(flat2, flat2_q3); // get values for when (flat2 && flat && mask) q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); + _mm_store_si128((__m128i *)(s - 4 * pitch), p3); + _mm_store_si128((__m128i *)(s + 3 * pitch), q3); p2 = _mm_andnot_si128(flat2, p2); // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -445,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q2 = _mm_and_si128(flat2, flat2_q2); // get values for when (flat2 && flat && mask) q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); p1 = _mm_andnot_si128(flat2, p1); // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -458,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q1 = _mm_and_si128(flat2, flat2_q1); // get values for when (flat2 && flat && mask) q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); p0 = _mm_andnot_si128(flat2, p0); // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -471,39 +470,39 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q0 = _mm_and_si128(flat2, flat2_q0); // get values for when (flat2 && flat && mask) q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s - 0 * pitch), q0); } -void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd); } -void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + const __m128i zero = _mm_setzero_si128(); + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -520,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -554,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; mask = _mm_max_epi16(abs_q1q0, mask); @@ -577,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 @@ -675,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -695,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s + 0 * pitch), q0); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); } void vpx_highbd_lpf_horizontal_8_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } -void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -761,57 +760,57 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); - tff80 = _mm_set1_epi16(0xff80); - tffe0 = _mm_set1_epi16(0xffe0); + tff80 = _mm_set1_epi16((int16_t)0xff80); + tffe0 = _mm_set1_epi16((int16_t)0xffe0); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; @@ -823,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // filter4 @@ -873,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } void vpx_highbd_lpf_horizontal_4_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], @@ -999,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1010,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1019,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_4_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1031,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1039,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1056,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1065,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_8_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1077,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1086,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); @@ -1105,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, @@ -1116,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = s; // Transpose back - highbd_transpose(src, 8, dst, p, 2); + highbd_transpose(src, 8, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, + pitch); } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 0000000000..35ca554049 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static VPX_FORCE_INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1))); + qp[0] = _mm256_add_epi32(qp[0], rnd); + qp[0] = _mm256_srai_epi32(qp[0], log_scale); + + qp[1] = _mm256_add_epi32(qp[1], rnd); + qp[1] = _mm256_srai_epi32(qp[1], log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when + // calculating the zbin mask. + qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1)); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x, + const __m256i *y) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, + __m256i eobmax, + __m256i nz_mask) { + const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask_perm = + _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); + const __m256i iscan = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); + return _mm256_max_epi16(eobmax, nz_iscan); +} + +// Get the max eob from the lower 128 bits. +static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); +#if defined(_MSC_VER) && (_MSC_VER < 1910) + return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff; +#else + return (uint16_t)_mm256_extract_epi16(eob, 0); +#endif +} + +static VPX_FORCE_INLINE void quantize(const __m256i *qp, + const tran_low_t *coeff_ptr, + const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]); + const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int step = 8; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + const int16_t *iscan = scan_order->iscan; + + init_qp(mb_plane, dequant_ptr, qp, 0); + + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} + +static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x, + const __m256i *y, + int log_scale) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale); + prod_hi = _mm256_slli_epi64(prod_hi, 32); + return _mm256_or_si256(prod_lo, prod_hi); +} + +static VPX_FORCE_INLINE void quantize_b_32x32( + const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { + const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i abs_coeff = _mm256_abs_epi32(coeff); + const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]); + + if (_mm256_movemask_epi8(zbin_mask) == 0) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + return; + } + + { + const __m256i tmp_rnd = + _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask); + // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0); + const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd); + // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1); + const __m256i abs_dq = + _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1); + const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256()); + const __m256i q = _mm256_sign_epi32(abs_q, coeff); + const __m256i dq = _mm256_sign_epi32(abs_dq, coeff); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask); + } +} + +void vpx_highbd_quantize_b_32x32_avx2( + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + __m256i eob = _mm256_setzero_si256(); + __m256i qp[5]; + + init_qp(mb_plane, dequant_ptr, qp, 1); + + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + + *eob_ptr = get_max_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 2362476c1f..adae60756d 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -8,24 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + const struct ScanOrder *const scan_order) { + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -36,75 +41,72 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = + (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; + int i, eob = 0; + const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -116,39 +118,36 @@ void vpx_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; } - *eob_ptr = eob + 1; + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob; } -#endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c new file mode 100644 index 0000000000..e483fdce73 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX2 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + int x; + + for (x = 0; x < 4; ++x) { + __m256i r[4]; + r[0] = _mm256_loadu_si256((const __m256i *)refs[x]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3)); + + // sum every abs diff + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1])); + sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3])); + } + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD64XNX4D(n) \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[8]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16)); + r[4] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)refs[3]); + r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16)); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2)); + r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s)); + r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2)); + r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s)); + r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1])); + sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3])); + sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5])); + sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7])); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD32XNX4D(n) \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, + const uint16_t *src, + int src_stride, + uint16_t *refs[4], + int ref_stride, int height) { + int i; + for (i = 0; i < height; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // absolute differences between every ref[] to src + r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s)); + r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s)); + r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s)); + r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s)); + + // sum every abs diff + sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]); + sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]); + sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]); + sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]); + + src += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } +} + +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < num_iters; ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 4; + } + calc_final_4(sums_32, sad_array); +} + +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4]) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + { + __m256i sums_32[4]; + sums_32[0] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))); + sums_32[1] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))); + sums_32[2] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))); + sums_32[3] = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))); + calc_final_4(sums_32, sad_array); + } +} + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e019..a07892d811 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c new file mode 100644 index 0000000000..78f8eb8bfa --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_avx2.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) { + const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8)); + const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4)); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1), + _mm256_extractf128_si256(t1, 1)); + return (unsigned int)_mm_cvtsi128_si32(sum); +} + +static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, + const uint16_t *src, int src_stride, + uint16_t *ref, int ref_stride, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + } +} + +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; + int i; + + for (i = 0; i < num_iters; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + +unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + +// AVG ------------------------------------------------------------------------- +static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32)); + const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32)); + const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + const __m256i avg2 = _mm256_avg_epu16(r2, x2); + const __m256i avg3 = _mm256_avg_epu16(r3, x3); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2)); + const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3)); + // sum every abs diff + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1)); + *sums_16 = + _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3)); + + src += src_stride; + ref += ref_stride; + sec += 64; + } +} + +#define HIGHBD_SAD64XN_AVG(n) \ + unsigned int vpx_highbd_sad64x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 2); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \ + \ + /* sums_16 will outrange after 2 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 1; \ + ref += ref_stride << 1; \ + sec += 64 << 1; \ + } \ + return calc_final(sums_32); \ + } + +// 64x64 +HIGHBD_SAD64XN_AVG(64) + +// 64x32 +HIGHBD_SAD64XN_AVG(32) + +static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; ++i) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref/pred avg to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride; + ref += ref_stride; + sec += 32; + } +} + +#define HIGHBD_SAD32XN_AVG(n) \ + unsigned int vpx_highbd_sad32x##n##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \ + __m256i sums_32 = _mm256_setzero_si256(); \ + int i; \ + \ + for (i = 0; i < (n / 8); ++i) { \ + __m256i sums_16 = _mm256_setzero_si256(); \ + \ + highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \ + \ + /* sums_16 will outrange after 8 rows, so add current sums_16 to \ + * sums_32*/ \ + sums_32 = _mm256_add_epi32( \ + sums_32, \ + _mm256_add_epi32( \ + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ + \ + src += src_stride << 3; \ + ref += ref_stride << 3; \ + sec += 32 << 3; \ + } \ + return calc_final(sums_32); \ + } + +// 32x64 +HIGHBD_SAD32XN_AVG(64) + +// 32x32 +HIGHBD_SAD32XN_AVG(32) + +// 32x16 +HIGHBD_SAD32XN_AVG(16) + +static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16, + const uint16_t *src, + int src_stride, uint16_t *ref, + int ref_stride, uint16_t *sec, + int height) { + int i; + for (i = 0; i < height; i += 2) { + // load src and all ref[] + const __m256i s0 = _mm256_load_si256((const __m256i *)src); + const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride)); + const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16)); + const __m256i avg0 = _mm256_avg_epu16(r0, x0); + const __m256i avg1 = _mm256_avg_epu16(r1, x1); + // absolute differences between every ref[] to src + const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0)); + const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1)); + // sum every abs diff + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0); + *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1); + + src += src_stride << 1; + ref += ref_stride << 1; + sec += 32; + } +} + +unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < 2; ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 4; + ref += ref_stride << 4; + sec += 16 << 4; + } + return calc_final(sums_32); +} + +unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} + +unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); + + { + const __m256i sums_32 = _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))); + return calc_final(sums_32); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm index bc4b28db24..62ad2237ff 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,22 +25,33 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ +cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 30ee81b688..5a3a2818de 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -32,12 +32,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 mova %4, %3 ; make copies to manipulate to calc sum @@ -72,13 +72,13 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro %macro INC_SRC_BY_SRC_STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 add srcq, src_stridemp add srcq, src_stridemp %else @@ -91,81 +91,65 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if VPX_ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd %endif @@ -181,7 +165,7 @@ SECTION .text sar block_height, 1 %endif %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif ; FIXME(rbultje) replace by jumptable? @@ -196,35 +180,35 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m2, [srcq + 16] - mova m1, [dstq] - mova m3, [dstq + 16] + mova m1, [refq] + mova m3, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m2, [secq+16] + pavgw m0, [second_predq] + pavgw m2, [second_predq+16] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq + src_strideq*2] - mova m1, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m1, [refq] + mova m3, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -242,40 +226,40 @@ SECTION .text movu m1, [srcq+16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pavgw m0, m1 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -284,19 +268,19 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if VPX_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0, reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm @@ -308,7 +292,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -318,8 +302,8 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be @@ -336,23 +320,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m4, m1 - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd @@ -364,16 +348,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -397,41 +381,41 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] - mova m2, [dstq] - mova m3, [dstq + 16] + mova m2, [refq] + mova m3, [refq + 16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] - mova m2, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m2, [refq] + mova m3, [refq + ref_strideq*2] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -460,20 +444,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m1, m3 - mova m4, [dstq] - mova m5, [dstq + 16] + mova m4, [refq] + mova m5, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -489,20 +473,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m2, m3 - mova m4, [dstq] - mova m5, [dstq + dst_strideq*2] + mova m4, [refq] + mova m5, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -511,19 +495,19 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if VPX_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86_32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0.5. We can reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm @@ -535,7 +519,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -565,21 +549,21 @@ SECTION .text paddw m0, filter_rnd psrlw m1, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -604,21 +588,21 @@ SECTION .text paddw m0, filter_rnd psrlw m4, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -633,19 +617,19 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if VPX_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0. We can reuse y_offset reg. %define tempq y_offsetq add x_offsetq, g_bilin_filterm @@ -657,7 +641,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -667,8 +651,8 @@ SECTION .text movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -680,23 +664,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m2, [srcq+2] movu m3, [srcq+src_strideq*2+2] - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -708,16 +692,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -732,19 +716,19 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if VPX_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0.5. We can reuse y_offset reg. %define tempq y_offsetq add x_offsetq, g_bilin_filterm @@ -756,7 +740,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -789,24 +773,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m1, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -830,24 +814,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m2, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -859,24 +843,24 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && mmsize == 16 +%if VPX_ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; In this case, there is NO unused register. Used src_stride register. Later, ; src_stride has to be loaded from stack when it is needed. %define tempq src_strideq @@ -897,7 +881,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter @@ -945,23 +929,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m1, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m1, m3 psrlw m0, 4 psrlw m1, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 2] + lea refq, [refq + ref_strideq * 2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -999,23 +983,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m4, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m4, m3 psrlw m0, 4 psrlw m4, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 4] + lea refq, [refq + ref_strideq * 4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 923418a992..5bee51fa0c 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,16 +11,18 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) -global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +globalsym(vpx_highbd_calc16x16var_sse2) sym(vpx_highbd_calc16x16var_sse2): push rbp mov rbp, rsp @@ -34,8 +36,8 @@ sym(vpx_highbd_calc16x16var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes @@ -167,13 +169,13 @@ sym(vpx_highbd_calc16x16var_sse2): ;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) -global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +globalsym(vpx_highbd_calc8x8var_sse2) sym(vpx_highbd_calc8x8var_sse2): push rbp mov rbp, rsp @@ -187,8 +189,8 @@ sym(vpx_highbd_calc8x8var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes diff --git a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index 414ae5de1a..381e0ad193 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,8 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "./vpx_config.h" +#include // SSE2 +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, @@ -89,9 +91,9 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, } #define HIGH_GET_VAR(S) \ - void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ + void vpx_highbd_8_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ @@ -120,8 +122,8 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, *sse = ROUND_POWER_OF_TWO(*sse, 8); \ } -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); +HIGH_GET_VAR(16) +HIGH_GET_VAR(8) #undef HIGH_GET_VAR @@ -135,7 +137,7 @@ HIGH_GET_VAR(8); highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \ } \ \ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ @@ -148,7 +150,7 @@ HIGH_GET_VAR(8); highbd_10_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ \ @@ -162,20 +164,20 @@ HIGH_GET_VAR(8); highbd_12_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? (uint32_t)var : 0; \ } -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); +VAR_FN(64, 64, 16, 12) +VAR_FN(64, 32, 16, 11) +VAR_FN(32, 64, 16, 11) +VAR_FN(32, 32, 16, 10) +VAR_FN(32, 16, 16, 9) +VAR_FN(16, 32, 16, 9) +VAR_FN(16, 16, 16, 8) +VAR_FN(16, 8, 8, 7) +VAR_FN(8, 16, 8, 7) +VAR_FN(8, 8, 8, 6) #undef VAR_FN @@ -251,13 +253,13 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ - DECL(8, opt); \ + DECL(8, opt) \ DECL(16, opt) -DECLS(sse2); +DECLS(sse2) #undef DECLS #undef DECL @@ -265,61 +267,62 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -328,44 +331,46 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int start_row; \ uint32_t sse; \ int se = 0; \ + int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \ NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \ &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ @@ -375,23 +380,24 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt, (int64_t)) \ + FN(8, 16, 8, 3, 4, opt, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) -FNS(sse2); +FNS(sse2) #undef FNS #undef FN @@ -400,79 +406,80 @@ FNS(sse2); #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \ + ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \ void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ DECL(8, opt1) -DECLS(sse2); +DECLS(sse2) #undef DECL #undef DECLS #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ } \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ } \ \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ + int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -481,46 +488,48 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 2); \ sse = ROUND_POWER_OF_TWO(sse, 4); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ + int64_t var; \ uint32_t sse; \ int se = 0; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \ w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ @@ -530,23 +539,70 @@ DECLS(sse2); se = ROUND_POWER_OF_TWO(se, 4); \ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt1) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ - FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int64_t)); +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)) \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)) -FNS(sse2); +FNS(sse2) #undef FNS #undef FN + +void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i, j; + if (width > 8) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; j += 16) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]); + _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1)); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } + } else if (width == 8) { + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]); + const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]); + _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1)); + comp_pred += 8 << 1; + pred += 8 << 1; + ref += ref_stride << 1; + } + } else { + assert(width == 4); + for (i = 0; i < height; i += 2) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]); + const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]); + const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]); + const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]); + _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0)); + _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1)); + comp_pred += 4 << 1; + pred += 4 << 1; + ref += ref_stride << 1; + } + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm index c18095c287..61af6236ed 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset psrlq m3, 8 movd [dstq+strideq ], m3 psrlq m0, 56 - movd tempq, m0 + movd tempd, m0 mov [dstq+strideq+3], tempb RESTORE_GOT diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 0000000000..752435d240 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. +static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + (idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = _mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index 487a474a67..f42b3df849 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,169 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -static INLINE void transpose_4x4(__m128i *res) { +static INLINE void transpose_16bit_4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); @@ -178,35 +23,75 @@ static INLINE void transpose_4x4(__m128i *res) { res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -void idct4_sse2(__m128i *in) { +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i eight = _mm_set1_epi16(8); + __m128i in[2]; + + // Rows + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); + idct4_sse2(in); + + // Columns + idct4_sse2(in); + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + + recon_and_store4x4_sse2(in, dest, stride); +} + +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + int a; + __m128i dc_value, d[2]; + + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], dc_value); + d[1] = _mm_add_epi16(d[1], dc_value); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +void idct4_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; + __m128i u[2]; - transpose_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); // stage 2 in[0] = _mm_add_epi16(u[0], u[1]); @@ -214,358 +99,137 @@ void idct4_sse2(__m128i *in) { in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -void iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; +void iadst4_sse2(__m128i *const in) { + const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9); + const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9); + const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9); + const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_12_n3 = + pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9); + __m128i u[4], v[5]; - transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); + // 00 01 20 21 02 03 22 23 + // 10 11 30 31 12 13 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); + // 00 01 10 11 20 21 30 31 + // 02 03 12 13 22 23 32 33 + in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1); - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1 + v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3 + v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1 + v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3 + v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1 + in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2 + in[1] = _mm_srli_epi32(in[1], 16); + in[0] = _mm_add_epi16(in[0], in[1]); + in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3 u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); + u[1] = _mm_sub_epi32(v[2], v[3]); + u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[3] = _mm_add_epi32(u[3], v[4]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); in[0] = _mm_packs_epi32(u[0], u[1]); in[1] = _mm_packs_epi32(u[2], u[3]); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_add_epi16(stp1_0, stp2_7); \ - out1 = _mm_add_epi16(stp1_1, stp1_6); \ - out2 = _mm_add_epi16(stp1_2, stp1_5); \ - out3 = _mm_add_epi16(stp1_3, stp2_4); \ - out4 = _mm_sub_epi16(stp1_3, stp2_4); \ - out5 = _mm_sub_epi16(stp1_2, stp1_5); \ - out6 = _mm_sub_epi16(stp1_1, stp1_6); \ - out7 = _mm_sub_epi16(stp1_0, stp2_7); \ - } +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in[8]; int i; // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + load_buffer_8x8(input, in); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + vpx_idct8_sse2(in); } - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); + write_buffer_8x8(in, dest, stride); +} - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); +} + +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); } void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } -void iadst8_sse2(__m128i *in) { +void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -578,736 +242,279 @@ void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i kZero = _mm_setzero_si128(); + __m128i s[8], u[16], v[8], w[16]; // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; + transpose_16bit_8x8(in, in); // column transformation // stage 1 // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); + s[0] = _mm_unpacklo_epi16(in[7], in[0]); + s[1] = _mm_unpackhi_epi16(in[7], in[0]); + s[2] = _mm_unpacklo_epi16(in[5], in[2]); + s[3] = _mm_unpackhi_epi16(in[5], in[2]); + s[4] = _mm_unpacklo_epi16(in[3], in[4]); + s[5] = _mm_unpackhi_epi16(in[3], in[4]); + s[6] = _mm_unpacklo_epi16(in[1], in[6]); + s[7] = _mm_unpackhi_epi16(in[1], in[6]); - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30); + u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30); + u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02); + u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02); + u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22); + u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22); + u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10); + u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10); + u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14); + u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14); + u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18); + u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18); + u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06); + u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06); + u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26); + u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26); // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); + w[0] = _mm_add_epi32(u[0], u[8]); + w[1] = _mm_add_epi32(u[1], u[9]); + w[2] = _mm_add_epi32(u[2], u[10]); + w[3] = _mm_add_epi32(u[3], u[11]); + w[4] = _mm_add_epi32(u[4], u[12]); + w[5] = _mm_add_epi32(u[5], u[13]); + w[6] = _mm_add_epi32(u[6], u[14]); + w[7] = _mm_add_epi32(u[7], u[15]); + w[8] = _mm_sub_epi32(u[0], u[8]); + w[9] = _mm_sub_epi32(u[1], u[9]); + w[10] = _mm_sub_epi32(u[2], u[10]); + w[11] = _mm_sub_epi32(u[3], u[11]); + w[12] = _mm_sub_epi32(u[4], u[12]); + w[13] = _mm_sub_epi32(u[5], u[13]); + w[14] = _mm_sub_epi32(u[6], u[14]); + w[15] = _mm_sub_epi32(u[7], u[15]); // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); + u[8] = dct_const_round_shift_sse2(w[8]); + u[9] = dct_const_round_shift_sse2(w[9]); + u[10] = dct_const_round_shift_sse2(w[10]); + u[11] = dct_const_round_shift_sse2(w[11]); + u[12] = dct_const_round_shift_sse2(w[12]); + u[13] = dct_const_round_shift_sse2(w[13]); + u[14] = dct_const_round_shift_sse2(w[14]); + u[15] = dct_const_round_shift_sse2(w[15]); // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + in[2] = _mm_packs_epi32(u[4], u[5]); + in[3] = _mm_packs_epi32(u[6], u[7]); + in[4] = _mm_packs_epi32(u[8], u[9]); + in[5] = _mm_packs_epi32(u[10], u[11]); + in[6] = _mm_packs_epi32(u[12], u[13]); + in[7] = _mm_packs_epi32(u[14], u[15]); // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); + s[0] = _mm_add_epi16(in[0], in[2]); + s[1] = _mm_add_epi16(in[1], in[3]); + s[2] = _mm_sub_epi16(in[0], in[2]); + s[3] = _mm_sub_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[4], in[5]); + u[1] = _mm_unpackhi_epi16(in[4], in[5]); + u[2] = _mm_unpacklo_epi16(in[6], in[7]); + u[3] = _mm_unpackhi_epi16(in[6], in[7]); - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); + w[0] = _mm_add_epi32(v[0], v[4]); + w[1] = _mm_add_epi32(v[1], v[5]); + w[2] = _mm_add_epi32(v[2], v[6]); + w[3] = _mm_add_epi32(v[3], v[7]); + w[4] = _mm_sub_epi32(v[0], v[4]); + w[5] = _mm_sub_epi32(v[1], v[5]); + w[6] = _mm_sub_epi32(v[2], v[6]); + w[7] = _mm_sub_epi32(v[3], v[7]); - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); + s[4] = _mm_packs_epi32(u[0], u[1]); + s[5] = _mm_packs_epi32(u[2], u[3]); + s[6] = _mm_packs_epi32(u[4], u[5]); + s[7] = _mm_packs_epi32(u[6], u[7]); // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); + s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16); - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[4]); + in[2] = s[6]; + in[3] = _mm_sub_epi16(kZero, s[2]); + in[4] = s[3]; + in[5] = _mm_sub_epi16(kZero, s[7]); + in[6] = s[5]; + in[7] = _mm_sub_epi16(kZero, s[1]); } -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_add_epi16(stp2_0, stp2_2); - tmp6 = _mm_sub_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); } -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i l[16], r[16], out[16], *in; int i; - curr1 = l; + in = l; for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; input += 128; } - for (i = 0; i < 2; i++) { + + for (i = 0; i < 16; i += 8) { int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, out[j]); } dest += 8; } } +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; + + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } + + dest += 8; + } +} + +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; + int i; + + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } -static void iadst16_8col(__m128i *in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1335,12 +542,11 @@ static void iadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); + const __m128i kZero = _mm_setzero_si128(); u[0] = _mm_unpacklo_epi16(in[15], in[0]); u[1] = _mm_unpackhi_epi16(in[15], in[0]); @@ -1425,71 +631,38 @@ static void iadst16_8col(__m128i *in) { u[30] = _mm_sub_epi32(v[14], v[30]); u[31] = _mm_sub_epi32(v[15], v[31]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); + u[16] = dct_const_round_shift_sse2(u[16]); + u[17] = dct_const_round_shift_sse2(u[17]); + u[18] = dct_const_round_shift_sse2(u[18]); + u[19] = dct_const_round_shift_sse2(u[19]); + u[20] = dct_const_round_shift_sse2(u[20]); + u[21] = dct_const_round_shift_sse2(u[21]); + u[22] = dct_const_round_shift_sse2(u[22]); + u[23] = dct_const_round_shift_sse2(u[23]); + u[24] = dct_const_round_shift_sse2(u[24]); + u[25] = dct_const_round_shift_sse2(u[25]); + u[26] = dct_const_round_shift_sse2(u[26]); + u[27] = dct_const_round_shift_sse2(u[27]); + u[28] = dct_const_round_shift_sse2(u[28]); + u[29] = dct_const_round_shift_sse2(u[29]); + u[30] = dct_const_round_shift_sse2(u[30]); + u[31] = dct_const_round_shift_sse2(u[31]); s[0] = _mm_packs_epi32(u[0], u[1]); s[1] = _mm_packs_epi32(u[2], u[3]); @@ -1552,39 +725,22 @@ static void iadst16_8col(__m128i *in) { u[14] = _mm_sub_epi32(v[6], v[14]); u[15] = _mm_sub_epi32(v[7], v[15]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); x[0] = _mm_add_epi16(s[0], s[4]); x[1] = _mm_add_epi16(s[1], s[5]); @@ -1647,39 +803,22 @@ static void iadst16_8col(__m128i *in) { u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + v[0] = dct_const_round_shift_sse2(u[0]); + v[1] = dct_const_round_shift_sse2(u[1]); + v[2] = dct_const_round_shift_sse2(u[2]); + v[3] = dct_const_round_shift_sse2(u[3]); + v[4] = dct_const_round_shift_sse2(u[4]); + v[5] = dct_const_round_shift_sse2(u[5]); + v[6] = dct_const_round_shift_sse2(u[6]); + v[7] = dct_const_round_shift_sse2(u[7]); + v[8] = dct_const_round_shift_sse2(u[8]); + v[9] = dct_const_round_shift_sse2(u[9]); + v[10] = dct_const_round_shift_sse2(u[10]); + v[11] = dct_const_round_shift_sse2(u[11]); + v[12] = dct_const_round_shift_sse2(u[12]); + v[13] = dct_const_round_shift_sse2(u[13]); + v[14] = dct_const_round_shift_sse2(u[14]); + v[15] = dct_const_round_shift_sse2(u[15]); s[0] = _mm_add_epi16(x[0], x[2]); s[1] = _mm_add_epi16(x[1], x[3]); @@ -1708,1718 +847,371 @@ static void iadst16_8col(__m128i *in) { u[6] = _mm_unpacklo_epi16(s[14], s[15]); u[7] = _mm_unpackhi_epi16(s[14], s[15]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); in[0] = s[0]; in[1] = _mm_sub_epi16(kZero, s[8]); in[2] = s[12]; in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); in[12] = s[5]; in[13] = _mm_sub_epi16(kZero, s[13]); in[14] = s[9]; in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); +} - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); +} - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 1 + butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i io[32], col[32]; int i; // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); - array_transpose_8x8(in, in); - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, io[j]); } dest += 8; } } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + __m128i col[4][32], io[32]; + int i; + // rows for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; + // columns + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); - IDCT32 + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); dest += 8; } } @@ -3427,611 +1219,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } - - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); - } - } - } -} - -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out; - - out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index d762a04abc..b4bbd186d2 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -8,189 +8,703 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ #include // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) { + const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)); + return _mm_srai_epi32(t, DCT_CONST_BITS); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; +static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in, + const __m128i cospi) { + const __m128i t = _mm_madd_epi16(in, cospi); + return dct_const_round_shift_sse2(t); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0, + const __m128i in1, + const __m128i x) { + const __m128i t0 = idct_madd_round_shift_sse2(in0, x); + const __m128i t1 = idct_madd_round_shift_sse2(in1, x); + return _mm_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. +static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); +} + +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); +} + +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); +#else + return _mm_loadl_epi64((const __m128i *)data); +#endif +} + +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); #else return _mm_load_si128((const __m128i *)data); #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); +} + +static INLINE void round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); + + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} + +static INLINE void recon_and_store4x4_sse2(const __m128i *const in, + uint8_t *const dest, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + __m128i d[2]; + + // Reconstruction and Store + d[0] = _mm_cvtsi32_si128(*(const int *)(dest)); + d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)); + d[0] = _mm_unpacklo_epi32(d[0], + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d[1] = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]); + d[0] = _mm_unpacklo_epi8(d[0], zero); + d[1] = _mm_unpacklo_epi8(d[1], zero); + d[0] = _mm_add_epi16(d[0], in[0]); + d[1] = _mm_add_epi16(d[1], in[1]); + d[0] = _mm_packus_epi16(d[0], d[1]); + + *(int *)dest = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]); + d[0] = _mm_srli_si128(d[0], 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 } -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); + const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); } -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; -#endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. + output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 0000000000..6e99469b63 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} + +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); +} + +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; + + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); + + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); + + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); + + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } + + dest += 8; + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} + +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; + int i; + + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 0000000000..e9f0f69033 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index dee64e3ad3..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. - -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 - - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and doing as many stages as possible to minimize the -; storing/loading of intermediate results. To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. -; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw m11, m10 ; stp1_25 - pmulhrsw m15, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_24 - pmulhrsw m3, m10 ; stp1_23 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 11, 15 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m11 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m6, [rsp + transposed_in + 16 * 6] - - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - mova [stp + %3 + idx22], m15 - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - mova [stp + %3 + idx23], m3 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m3, m0 ; stp1_8 - mova m2, m1 ; stp1_15 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - mova m4, m7 ; stp1_11 - mova m5, m6 ; stp1_12 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m10, [pw_11585x2] - pmulhrsw m0, m10 ; stp1_1 - - mova m14, m11 ; stp1_4 - mova m13, m12 ; stp1_7 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m7, m0 ; stp1_0 = stp1_1 - mova m4, m0 ; stp1_1 - mova m2, m7 ; stp1_0 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 7, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 4, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 1, 9 ; stp1_0, stp1_15 - SUM_SUB 7, 3, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 4, 6, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m15, [stp + %4 + idx30] - mova m10, [stp + %4 + idx31] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 7, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m7 - mova [stp + %4 + idx30], m15 - mova [stp + %4 + idx31], m10 - mova m7, [stp + %4 + idx28] - mova m0, [stp + %4 + idx29] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 4, 7, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m4 - mova [stp + %4 + idx28], m7 - mova [stp + %4 + idx29], m0 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m4, [stp + %3 + idx19] - SUM_SUB 1, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 3, 7, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 6, 4, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m4 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m0, [stp + %4 + idx27] - mova m1, [stp + %4 + idx26] - mova m2, [stp + %4 + idx25] - mova m3, [stp + %4 + idx24] - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - mova [stp + %4 + idx27], m0 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx24], m3 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -%macro RECON_AND_STORE 1 - mova m11, [pw_32] - lea stp, [rsp + %1] - mov r6, 32 - pxor m8, m8 -%%recon_and_store: - mova m0, [stp + 16 * 32 * 0] - mova m1, [stp + 16 * 32 * 1] - mova m2, [stp + 16 * 32 * 2] - mova m3, [stp + 16 * 32 * 3] - add stp, 16 - - paddw m0, m11 - paddw m1, m11 - paddw m2, m11 - paddw m3, m11 - psraw m0, 6 - psraw m1, 6 - psraw m2, 6 - psraw m3, 6 - movh m4, [outputq + 0] - movh m5, [outputq + 8] - movh m6, [outputq + 16] - movh m7, [outputq + 24] - punpcklbw m4, m8 - punpcklbw m5, m8 - punpcklbw m6, m8 - punpcklbw m7, m8 - paddw m0, m4 - paddw m1, m5 - paddw m2, m6 - paddw m3, m7 - packuswb m0, m1 - packuswb m2, m3 - mova [outputq + 0], m0 - mova [outputq + 16], m2 - lea outputq, [outputq + strideq] - dec r6 - jnz %%recon_and_store -%endmacro - -%define i32x32_size 16*32*5 -%define pass_two_start 16*32*0 -%define transposed_in 16*32*4 -%define pass_one_start 16*32*0 -%define stp r8 - -INIT_XMM ssse3 -cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - lea stp, [rsp + pass_one_start] - -idct32x32_34: - mov r3, inputq - lea r4, [rsp + transposed_in] - -idct32x32_34_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_34_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - -idct32x32_34_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - IDCT32X32_34 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_34_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_135 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, m3 - pmulhrsw m3, [pw__7005x2] ; stp1_18 - pmulhrsw m4, [pw_14811x2] ; stp2_29 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, m0 - pmulhrsw m0, [pw_12140x2] ; stp1_30 - pmulhrsw m2, [pw_m11003x2] ; stp2_17 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, m2 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - - mova m14, [rsp + transposed_in + 16 * 11] - mova m13, m14 - pmulhrsw m13, [pw_m8423x2] ; stp1_21 - pmulhrsw m14, [pw_14053x2] ; stp2_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, m0 - pmulhrsw m0, [pw__9760x2] ; stp1_22 - pmulhrsw m1, [pw_13160x2] ; stp2_25 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, m0 - pmulhrsw m0, [pw__1606x2] ; stp1_8 - pmulhrsw m1, [pw_16305x2] ; stp2_15 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, m6 - pmulhrsw m7, [pw_m4756x2] ; stp2_11 - pmulhrsw m6, [pw_15679x2] ; stp1_12 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, m4 - pmulhrsw m4, [pw__7723x2] ; stp1_10 - pmulhrsw m5, [pw_14449x2] ; stp2_13 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, m2 - pmulhrsw m3, [pw_m10394x2] ; stp1_9 - pmulhrsw m2, [pw_12665x2] ; stp2_14 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, m11 - pmulhrsw m11, [pw__3196x2] ; stp1_4 - pmulhrsw m12, [pw_16069x2] ; stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, m13 - pmulhrsw m13, [pw_13623x2] ; stp1_6 - pmulhrsw m14, [pw_m9102x2] ; stp1_5 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m2, [rsp + transposed_in + 16 * 8] - pmulhrsw m0, [pw_11585x2] ; stp1_1 - mova m3, m2 - pmulhrsw m2, [pw__6270x2] ; stp1_2 - pmulhrsw m3, [pw_15137x2] ; stp1_3 - - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - mova m1, m0 ; stp1_0 = stp1_1 - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 2 - lea stp, [rsp + pass_one_start] - -idct32x32_135: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 2 - -idct32x32_135_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose - - IDCT32X32_135 16*0, 16*32, 16*64, 16*96 - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_135 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_135_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 2 - -idct32x32_135_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_135_transpose_2 - - IDCT32X32_135 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_135_2 - - RECON_AND_STORE pass_two_start - - RET - -%macro IDCT32X32_1024 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m1, [rsp + transposed_in + 16 * 1] - mova m11, [rsp + transposed_in + 16 * 31] - BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31 - - mova m0, [rsp + transposed_in + 16 * 15] - mova m2, [rsp + transposed_in + 16 * 17] - BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30 - - mova m7, [rsp + transposed_in + 16 * 7] - mova m12, [rsp + transposed_in + 16 * 25] - BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28 - - mova m3, [rsp + transposed_in + 16 * 9] - mova m4, [rsp + transposed_in + 16 * 23] - BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 - SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 - SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 - SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m3 - mova [stp + %4 + idx30], m2 - mova [stp + %4 + idx31], m11 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m5, [rsp + transposed_in + 16 * 5] - mova m6, [rsp + transposed_in + 16 * 27] - BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27 - - mova m13, [rsp + transposed_in + 16 * 21] - mova m14, [rsp + transposed_in + 16 * 11] - BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26 - - mova m0, [rsp + transposed_in + 16 * 13] - mova m1, [rsp + transposed_in + 16 * 19] - BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25 - - mova m2, [rsp + transposed_in + 16 * 3] - mova m3, [rsp + transposed_in + 16 * 29] - BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 - SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 - SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 - SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %3 + idx16] - mova m7, [stp + %3 + idx17] - mova m11, [stp + %3 + idx18] - mova m12, [stp + %3 + idx19] - SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 - SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m4 - mova [stp + %3 + idx17], m7 - mova [stp + %3 + idx18], m11 - mova [stp + %3 + idx19], m12 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m11, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m11 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 1, 0, 9 - pmulhrsw m1, m10 ; stp1_25 - pmulhrsw m0, m10 ; stp1_22 - SUM_SUB 2, 3, 9 - pmulhrsw m2, m10 ; stp1_25 - pmulhrsw m3, m10 ; stp1_22 -%else - BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 - SWAP 6, 5 - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 - SWAP 13, 14 - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 - SWAP 1, 0 - BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 - SWAP 2, 3 -%endif - mova [stp + %3 + idx20], m5 - mova [stp + %3 + idx21], m14 - mova [stp + %3 + idx22], m0 - mova [stp + %3 + idx23], m3 - mova [stp + %4 + idx24], m2 - mova [stp + %4 + idx25], m1 - mova [stp + %4 + idx26], m13 - mova [stp + %4 + idx27], m6 - - ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 2] - mova m1, [rsp + transposed_in + 16 * 30] - BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15 - - mova m2, [rsp + transposed_in + 16 * 14] - mova m3, [rsp + transposed_in + 16 * 18] - BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14 - - mova m4, [rsp + transposed_in + 16 * 10] - mova m5, [rsp + transposed_in + 16 * 22] - BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13 - - mova m6, [rsp + transposed_in + 16 * 6] - mova m7, [rsp + transposed_in + 16 * 26] - BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12 - - ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 - SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 - SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 - SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 - - ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 - BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 - - ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 - SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 - SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 - SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 - - ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 5, 4, 9 - pmulhrsw m5, m10 ; stp1_13 - pmulhrsw m4, m10 ; stp1_10 - SUM_SUB 6, 7, 9 - pmulhrsw m6, m10 ; stp1_12 - pmulhrsw m7, m10 ; stp1_11 -%else - BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 - SWAP 5, 4 - BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 - SWAP 6, 7 -%endif - ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova [stp + %2 + idx8], m0 - mova [stp + %2 + idx9], m2 - mova [stp + %2 + idx10], m4 - mova [stp + %2 + idx11], m7 - mova [stp + %2 + idx12], m6 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m3 - mova [stp + %2 + idx15], m1 - - ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; - ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, [rsp + transposed_in + 16 * 4] - mova m12, [rsp + transposed_in + 16 * 28] - BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7 - - mova m13, [rsp + transposed_in + 16 * 12] - mova m14, [rsp + transposed_in + 16 * 20] - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6 - - ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m0, [rsp + transposed_in + 16 * 0] - mova m1, [rsp + transposed_in + 16 * 16] - -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 0, 1, 9 - pmulhrsw m0, m10 ; stp1_1 - pmulhrsw m1, m10 ; stp1_0 -%else - BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0 - SWAP 0, 1 -%endif - mova m2, [rsp + transposed_in + 16 * 8] - mova m3, [rsp + transposed_in + 16 * 24] - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3 - - mova m10, [pw_11585x2] - SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 - SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 - - ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_6 - pmulhrsw m14, m10 ; stp1_5 -%else - BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 - SWAP 13, 14 -%endif - SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 - SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 - - ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 - SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 - SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 - SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 - - ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m4, [stp + %2 + idx12] - mova m5, [stp + %2 + idx13] - mova m6, [stp + %2 + idx14] - mova m7, [stp + %2 + idx15] - SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 - SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 - SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 - SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 - - ; 0-3, 28-31 final stage - mova m10, [stp + %4 + idx31] - mova m15, [stp + %4 + idx30] - SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 - SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 - mova [stp + %1 + idx0], m0 - mova [stp + %1 + idx1], m1 - mova [stp + %4 + idx31], m10 - mova [stp + %4 + idx30], m15 - mova m0, [stp + %4 + idx29] - mova m1, [stp + %4 + idx28] - SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 - SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 - mova [stp + %1 + idx2], m2 - mova [stp + %1 + idx3], m3 - mova [stp + %4 + idx29], m0 - mova [stp + %4 + idx28], m1 - - ; 12-15, 16-19 final stage - mova m0, [stp + %3 + idx16] - mova m1, [stp + %3 + idx17] - mova m2, [stp + %3 + idx18] - mova m3, [stp + %3 + idx19] - SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 - SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 - SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 - SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 - mova [stp + %2 + idx12], m4 - mova [stp + %2 + idx13], m5 - mova [stp + %2 + idx14], m6 - mova [stp + %2 + idx15], m7 - mova [stp + %3 + idx16], m0 - mova [stp + %3 + idx17], m1 - mova [stp + %3 + idx18], m2 - mova [stp + %3 + idx19], m3 - - mova m4, [stp + %2 + idx8] - mova m5, [stp + %2 + idx9] - mova m6, [stp + %2 + idx10] - mova m7, [stp + %2 + idx11] - SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 - SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 - SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 - SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 - - ; 4-7, 24-27 final stage - mova m3, [stp + %4 + idx24] - mova m2, [stp + %4 + idx25] - mova m1, [stp + %4 + idx26] - mova m0, [stp + %4 + idx27] - SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 - SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 - SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 - SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 - mova [stp + %4 + idx24], m3 - mova [stp + %4 + idx25], m2 - mova [stp + %4 + idx26], m1 - mova [stp + %4 + idx27], m0 - mova [stp + %1 + idx4], m11 - mova [stp + %1 + idx5], m14 - mova [stp + %1 + idx6], m13 - mova [stp + %1 + idx7], m12 - - ; 8-11, 20-23 final stage - mova m0, [stp + %3 + idx20] - mova m1, [stp + %3 + idx21] - mova m2, [stp + %3 + idx22] - mova m3, [stp + %3 + idx23] - SUM_SUB 7, 0, 9 ; stp1_11, stp_20 - SUM_SUB 6, 1, 9 ; stp1_10, stp_21 - SUM_SUB 5, 2, 9 ; stp1_9, stp_22 - SUM_SUB 4, 3, 9 ; stp1_8, stp_23 - mova [stp + %2 + idx8], m4 - mova [stp + %2 + idx9], m5 - mova [stp + %2 + idx10], m6 - mova [stp + %2 + idx11], m7 - mova [stp + %3 + idx20], m0 - mova [stp + %3 + idx21], m1 - mova [stp + %3 + idx22], m2 - mova [stp + %3 + idx23], m3 -%endmacro - -INIT_XMM ssse3 -cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride - mova m8, [pd_8192] - mov r6, 4 - lea stp, [rsp + pass_one_start] - -idct32x32_1024: - mov r3, inputq - lea r4, [rsp + transposed_in] - mov r7, 4 - -idct32x32_1024_transpose: -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [r3 + 0] - packssdw m0, [r3 + 16] - mova m1, [r3 + 32 * 4] - packssdw m1, [r3 + 32 * 4 + 16] - mova m2, [r3 + 32 * 8] - packssdw m2, [r3 + 32 * 8 + 16] - mova m3, [r3 + 32 * 12] - packssdw m3, [r3 + 32 * 12 + 16] - mova m4, [r3 + 32 * 16] - packssdw m4, [r3 + 32 * 16 + 16] - mova m5, [r3 + 32 * 20] - packssdw m5, [r3 + 32 * 20 + 16] - mova m6, [r3 + 32 * 24] - packssdw m6, [r3 + 32 * 24 + 16] - mova m7, [r3 + 32 * 28] - packssdw m7, [r3 + 32 * 28 + 16] -%else - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 4] - mova m2, [r3 + 16 * 8] - mova m3, [r3 + 16 * 12] - mova m4, [r3 + 16 * 16] - mova m5, [r3 + 16 * 20] - mova m6, [r3 + 16 * 24] - mova m7, [r3 + 16 * 28] -%endif - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 -%if CONFIG_VP9_HIGHBITDEPTH - add r3, 32 -%else - add r3, 16 -%endif - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose - - IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 - - lea stp, [stp + 16 * 8] -%if CONFIG_VP9_HIGHBITDEPTH - lea inputq, [inputq + 32 * 32] -%else - lea inputq, [inputq + 16 * 32] -%endif - dec r6 - jnz idct32x32_1024 - - mov r6, 4 - lea stp, [rsp + pass_one_start] - lea r9, [rsp + pass_one_start] - -idct32x32_1024_2: - lea r4, [rsp + transposed_in] - mov r3, r9 - mov r7, 4 - -idct32x32_1024_transpose_2: - mova m0, [r3 + 0] - mova m1, [r3 + 16 * 1] - mova m2, [r3 + 16 * 2] - mova m3, [r3 + 16 * 3] - mova m4, [r3 + 16 * 4] - mova m5, [r3 + 16 * 5] - mova m6, [r3 + 16 * 6] - mova m7, [r3 + 16 * 7] - - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - mova [r4 + 0], m0 - mova [r4 + 16 * 1], m1 - mova [r4 + 16 * 2], m2 - mova [r4 + 16 * 3], m3 - mova [r4 + 16 * 4], m4 - mova [r4 + 16 * 5], m5 - mova [r4 + 16 * 6], m6 - mova [r4 + 16 * 7], m7 - - add r3, 16 * 8 - add r4, 16 * 8 - dec r7 - jne idct32x32_1024_transpose_2 - - IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 - - lea stp, [stp + 16 * 32] - add r9, 16 * 32 - dec r6 - jnz idct32x32_1024_2 - - RECON_AND_STORE pass_two_start - - RET -%endif diff --git a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm index fbbcd76bd7..bcf1a6ef98 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/inv_wht_sse2.asm @@ -9,6 +9,7 @@ ; %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION .text @@ -82,15 +83,8 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif + LOAD_TRAN_LOW 0, inputq, 0 + LOAD_TRAN_LOW 1, inputq, 8 psraw m0, 2 psraw m1, 2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c index 6652a62dcf..a58fb65539 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -13,38 +13,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { @@ -52,19 +52,19 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -84,7 +84,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), @@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -367,12 +367,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; @@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch))); + p256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch))); + p256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch))); + p256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch))); + p256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch))); + q256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch))); + q256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch))); + q256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch))); + q256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch))); + q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); @@ -423,7 +423,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); @@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -458,8 +458,8 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch))); q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( @@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch))); q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( @@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch))); q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( @@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); p3 = _mm_andnot_si128(flat2, p3); flat2_p3 = _mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c similarity index 80% rename from media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c rename to media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c index 28e6fd65f9..6ea34cdd16 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" +#include "vpx_dsp/x86/mem_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); @@ -30,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ hev = \ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_cmpgt_epi16(hev, thresh_v); \ hev = _mm_packs_epi16(hev, hev); \ \ /* const int8_t mask = filter_mask(*limit, *blimit, */ \ @@ -51,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { flat = _mm_max_epu8(work, flat); \ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_subs_epu8(mask, limit_v); \ mask = _mm_cmpeq_epi8(mask, zero); \ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ } while (0) @@ -60,7 +61,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { do { \ const __m128i t3t4 = \ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \ __m128i filter, filter2filter1, work; \ \ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ @@ -103,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ } while (0) -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 4 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 0 * pitch))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); @@ -132,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER_HEV_MASK; FILTER4; - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 + _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1 } -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i x0, x1, x2, x3; __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4))); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4))); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4))); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4))); // Transpose 8x8 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 @@ -212,69 +211,69 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -284,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -292,7 +291,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -342,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -520,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -591,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; @@ -609,27 +608,27 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, __m128i max_abs_p1p0q1q0; - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); @@ -638,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); @@ -648,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -678,8 +677,8 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -694,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); @@ -851,111 +850,111 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ } } -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 2 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 0 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); p0q0 = _mm_shuffle_epi32(q0p0, 78); { // filter_mask and hev_mask const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); @@ -964,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -979,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 @@ -997,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, unsigned char *src = s; { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1047,16 +1054,16 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1102,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_loadl_epi64((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1120,62 +1127,60 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_loadl_epi64((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0); + _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0); + _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1); + _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { +void vpx_lpf_horizontal_8_dual_sse2( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1227,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, do { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1279,20 +1292,20 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1344,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1362,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit0, + const unsigned char *limit0, + const unsigned char *thresh0, + const unsigned char *blimit1, + const unsigned char *limit1, + const unsigned char *thresh1) { const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); + const __m128i zero = _mm_setzero_si128(); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); // filter_mask and hev_mask { @@ -1412,7 +1425,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1448,20 +1461,20 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1506,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } } @@ -1626,16 +1639,12 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 x4 = _mm_unpackhi_epi16(x0, x1); @@ -1643,21 +1652,17 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1666,21 +1671,21 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering - vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); + vpx_lpf_horizontal_4_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1692,19 +1697,19 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, src[0] = s - 4; dst[0] = t_dst; - transpose(src, p, dst, 8, 1); + transpose(src, pitch, dst, 8, 1); // Loop filtering - vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_8(t_dst + 4 * 8, 8, blimit, limit, thresh); src[0] = t_dst; dst[0] = s - 4; // Transpose back - transpose(src, 8, dst, p, 1); + transpose(src, 8, dst, pitch, 1); } -void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1713,22 +1718,22 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering - vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); + vpx_lpf_horizontal_8_dual(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1742,10 +1747,10 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose(src, pitch, dst, 8, 2); // Loop filtering - vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + vpx_lpf_horizontal_16(t_dst + 8 * 8, 8, blimit, limit, thresh); src[0] = t_dst; src[1] = t_dst + 8 * 8; @@ -1753,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = s; // Transpose back - transpose(src, 8, dst, p, 2); + transpose(src, 8, dst, pitch, 2); } -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering - vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + vpx_lpf_horizontal_16_dual(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 0000000000..031f361a41 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_VPX_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 +#include + +#include "./vpx_config.h" + +static INLINE void storeu_int32(void *dst, int32_t v) { + memcpy(dst, &v, sizeof(v)); +} + +static INLINE int32_t loadu_int32(const void *src) { + int32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE __m128i load_unaligned_u32(const void *a) { + int val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE void store_unaligned_u32(void *const a, const __m128i v) { + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c new file mode 100644 index 0000000000..119fa7cd1a --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +extern const int16_t vpx_rv[]; + +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, + int cols, int flimit) { + int col; + const __m128i zero = _mm_setzero_si128(); + const __m128i f = _mm_set1_epi32(flimit); + DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + int row, i; + __m128i s = _mm_loadl_epi64((__m128i *)dst); + __m128i sum, sumsq_0, sumsq_1; + __m128i tmp_0, tmp_1; + __m128i below_context = _mm_setzero_si128(); + + s = _mm_unpacklo_epi8(s, zero); + + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)above_context + i, s); + } + + // sum *= 9 + sum = _mm_slli_epi16(s, 3); + sum = _mm_add_epi16(s, sum); + + // sum^2 * 9 == (sum * 9) * sum + tmp_0 = _mm_mullo_epi16(sum, s); + tmp_1 = _mm_mulhi_epi16(sum, s); + + sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1); + sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1); + + // Prime sum/sumsq + for (i = 1; i <= 6; ++i) { + __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch)); + a = _mm_unpacklo_epi8(a, zero); + sum = _mm_add_epi16(sum, a); + a = _mm_mullo_epi16(a, a); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero)); + } + + for (row = 0; row < rows + 8; row++) { + const __m128i above = + _mm_load_si128((__m128i *)above_context + (row & 7)); + __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch)); + __m128i above_sq, below_sq; + __m128i mask_0, mask_1; + __m128i multmp_0, multmp_1; + __m128i rv; + __m128i out; + + this_row = _mm_unpacklo_epi8(this_row, zero); + + if (row + 7 < rows) { + // Instead of copying the end context we just stop loading when we get + // to the last one. + below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch)); + below_context = _mm_unpacklo_epi8(below_context, zero); + } + + sum = _mm_sub_epi16(sum, above); + sum = _mm_add_epi16(sum, below_context); + + // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero + // extend. Unfortunately we can't do below_sq - above_sq in 16 bits + // because x86 does not have unpack with sign extension. + above_sq = _mm_mullo_epi16(above, above); + sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero)); + sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero)); + + below_sq = _mm_mullo_epi16(below_context, below_context); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero)); + + // sumsq * 16 - sumsq == sumsq * 15 + mask_0 = _mm_slli_epi32(sumsq_0, 4); + mask_0 = _mm_sub_epi32(mask_0, sumsq_0); + mask_1 = _mm_slli_epi32(sumsq_1, 4); + mask_1 = _mm_sub_epi32(mask_1, sumsq_1); + + multmp_0 = _mm_mullo_epi16(sum, sum); + multmp_1 = _mm_mulhi_epi16(sum, sum); + + mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1)); + mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1)); + + // mask - f gives a negative value when mask < f + mask_0 = _mm_sub_epi32(mask_0, f); + mask_1 = _mm_sub_epi32(mask_1, f); + + // Shift the sign bit down to create a mask + mask_0 = _mm_srai_epi32(mask_0, 31); + mask_1 = _mm_srai_epi32(mask_1, 31); + + mask_0 = _mm_packs_epi32(mask_0, mask_1); + + rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127))); + + mask_1 = _mm_add_epi16(rv, sum); + mask_1 = _mm_add_epi16(mask_1, this_row); + mask_1 = _mm_srai_epi16(mask_1, 4); + + mask_1 = _mm_and_si128(mask_0, mask_1); + mask_0 = _mm_andnot_si128(mask_0, this_row); + out = _mm_or_si128(mask_1, mask_0); + + _mm_storel_epi64((__m128i *)(dst + row * pitch), + _mm_packus_epi16(out, zero)); + + _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row); + } + + dst += 8; + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 0000000000..5ff5abc110 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#if defined(_MSC_VER) +#include +#endif +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + *eob_ptr = 0; + + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c new file mode 100644 index 0000000000..d4872f6bca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx2.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +static VPX_FORCE_INLINE void load_b_values_avx2( + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, + __m256i *shift, int log_scale) { + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *zbin = _mm256_add_epi16(*zbin, rnd); + *zbin = _mm256_srai_epi16(*zbin, log_scale); + } + // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when + // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16) + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + if (log_scale > 0) { + const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); + *round = _mm256_add_epi16(*round, rnd); + *round = _mm256_srai_epi16(*round, log_scale); + } + + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static VPX_FORCE_INLINE __m256i +load_coefficients_avx2(const tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +#else + // typedef int16_t tran_low_t; + return _mm256_loadu_si256((const __m256i *)coeff_ptr); +#endif +} + +static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_VP9_HIGHBITDEPTH + // typedef int32_t tran_low_t; + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo); + _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +#else + // typedef int16_t tran_low_t; + _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals); +#endif +} + +static VPX_FORCE_INLINE __m256i +quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant, + __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif // CONFIG_VP9_HIGHBITDEPTH + return _mm256_setzero_si256(); + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant); + + const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high); + const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high); +#else + const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant); +#endif + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr); +#endif + return v_nz_mask; + } +} + +static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, + __m256i v_eobmax, + __m256i v_mask) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); +#else + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); +#endif + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); + return _mm256_max_epi16(v_eobmax, v_nz_iscan); +} + +static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { + const __m128i eob_lo = _mm256_castsi256_si128(eob256); + const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1); + __m128i eob = _mm_max_epi16(eob_lo, eob_hi); + __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + const int16_t *iscan = scan_order->iscan; + + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); + // Do DC and first 15 AC. + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = n_coeffs - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, + &v_dequant, &v_round, &v_zbin, &v_quant_shift); + + v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} + +static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( + const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant, + __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin, + __m256i *v_quant_shift, __m256i *v_eobmax) { + const __m256i v_coeff = load_coefficients_avx2(coeff_ptr); + const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff); + const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin); + + if (_mm256_movemask_epi8(v_zbin_mask) == 0) { + _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256()); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256()); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256()); +#endif + return *v_eobmax; + } + { + // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0 + const __m256i v_tmp_rnd = + _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask); + // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + // quant_shift_ptr[rc != 0]) >> 15); + const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant); + const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd); + const __m256i v_tmp32_hi = + _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1); + const __m256i v_tmp32_lo = + _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15); + const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo); + const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff); + const __m256i v_sign_lo = + _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i v_sign_hi = + _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff); + const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant); + const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant); + const __m256i v_dqcoeff_lo = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo); + const __m256i v_dqcoeff_hi = _mm256_sign_epi32( + _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi); + const __m256i v_nz_mask = + _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256()); + + store_coefficients_avx2(v_qcoeff, qcoeff_ptr); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi); +#else + store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi), + dqcoeff_ptr); +#endif + + return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask); + } +} + +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; + __m256i v_eobmax = _mm256_setzero_si256(); + intptr_t count; + const int16_t *iscan = scan_order->iscan; + + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); + + // Do DC and first 15 AC. + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + + v_round = _mm256_unpackhi_epi64(v_round, v_round); + v_quant = _mm256_unpackhi_epi64(v_quant, v_quant); + v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant); + v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift); + v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin); + + for (count = (32 * 32) - 16; count > 0; count -= 16) { + coeff_ptr += 16; + qcoeff_ptr += 16; + dqcoeff_ptr += 16; + iscan += 16; + v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, + &v_quant, &v_dequant, &v_round, &v_zbin, + &v_quant_shift, &v_eobmax); + } + + *eob_ptr = accumulate_eob256(v_eobmax); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 01c41291be..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,544 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. - por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. - por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - - ; Skip-block, i.e. just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - - neg ncoeffq - pxor m7, m7 - -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - -END diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c index 0580a7bd7b..64838eaa7d 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -8,216 +8,106 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/x86/fdct.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - __m128i zero; - (void)scan_ptr; + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + const int16_t *iscan = scan_order->iscan; - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } + // Setup global values. + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + calculate_qcoeff(&qcoeff0, round, quant, shift); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + calculate_qcoeff(&qcoeff1, round, quant, shift); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_tran_low(zero, dqcoeff_ptr + n_coeffs); - store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); - store_tran_low(zero, qcoeff_ptr + n_coeffs); - store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h new file mode 100644 index 0000000000..82c755a0cf --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" + +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); +} + +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, + const int16_t *dequant_ptr, + __m128i *dequant) { + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// Scan 16 values for eob reference in scan. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const int16_t *scan, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); + __m128i eob0, eob1; + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..2c6d851a16 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const __m128i zero = _mm_setzero_si128(); + int index; + const int16_t *iscan = scan_order->iscan; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); + } + + // AC only loop. + for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 0000000000..e8d2a05771 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + _mm_store_si128((__m128i *)(dqcoeff), + _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index ca21539173..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,346 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pxor m11, m11 - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq - pxor m7, m7 -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c index 962b8fb11a..cf7111983b 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -11,154 +11,174 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. - // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } +// Note with sums[4] some versions of Visual Studio may fail due to parameter +// alignment, though the functions should be equivalent: +// error C2719: 'sums': formal parameter with requested alignment of 32 won't be +// aligned +static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; + const uint8_t *refs[4]; + __m256i sums[4]; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + for (i = 0; i < h; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // sum of the absolute differences between every ref[] to src + r[0] = _mm256_sad_epu8(r[0], s); + r[1] = _mm256_sad_epu8(r[1], s); + r[2] = _mm256_sad_epu8(r[2], s); + r[3] = _mm256_sad_epu8(r[3], s); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r[0]); + sums[1] = _mm256_add_epi32(sums[1], r[1]); + sums[2] = _mm256_add_epi32(sums[2], r[2]); + sums[3] = _mm256_add_epi32(sums[3], r[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. - // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } + calc_final_4(sums, sad_array); } + +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { + __m256i sums[4]; + int i; + const uint8_t *refs[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + __m256i r_lo[4], r_hi[4]; + // load 64 bytes from src and all ref[] + const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); + const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); + r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); + r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32)); + r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32)); + r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); + + // sum of the absolute differences between every ref[] to src + r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); + r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); + r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); + r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo); + r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi); + r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi); + r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); + r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); + sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); + sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); + sums[3] = _mm256_add_epi32(sums[3], r_lo[3]); + sums[0] = _mm256_add_epi32(sums[0], r_hi[0]); + sums[1] = _mm256_add_epi32(sums[1], r_hi[1]); + sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); + sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } + + calc_final_4(sums, sad_array); +} + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 0000000000..cc36cae611 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static INLINE void sad64xhx4d_avx512(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref_array[0]; + ref1 = ref_array[1]; + ref2 = ref_array[2]; + ref3 = ref_array[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < h; i++) { + // load src and all ref[] + src_reg = _mm512_loadu_si512((const __m512i *)src_ptr); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref[] to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref[] + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src_ptr += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref[] the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref[] + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(sad_array), sum128); + } +} + +void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { + sad64xhx4d_avx512(src, src_stride, ref_array, ref_stride, 64, sad_array); +} + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx512( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx512(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9a..ed4ea3ef9b 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad4d_sse2.asm @@ -179,13 +179,27 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 %else cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 +%endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] %endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c index d944134305..e00494d766 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx2.c @@ -11,88 +11,120 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD64 \ - FSAD64_H(64); \ - FSAD64_H(32); + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) #define FSAD32 \ - FSAD32_H(64); \ - FSAD32_H(32); \ - FSAD32_H(16); + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) -FSAD64; -FSAD32; +FSAD64 +FSAD32 #undef FSAD64 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -118,15 +150,14 @@ FSAD32; sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } #define FSADAVG32_H(h) \ unsigned int vpx_sad32x##h##_avg_avx2( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - int i, res; \ + int i; \ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ __m256i sum_sad = _mm256_setzero_si256(); \ __m256i sum_sad_h; \ @@ -156,21 +187,20 @@ FSAD32; sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ } -#define FSADAVG64 \ - FSADAVG64_H(64); \ - FSADAVG64_H(32); +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) -#define FSADAVG32 \ - FSADAVG32_H(64); \ - FSADAVG32_H(32); \ - FSADAVG32_H(16); +#define FSADAVG32 \ + FSADAVG32_H(64) \ + FSADAVG32_H(32) \ + FSADAVG32_H(16) -FSADAVG64; -FSADAVG32; +FSADAVG64 +FSADAVG32 #undef FSADAVG64 #undef FSADAVG32 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c new file mode 100644 index 0000000000..38bd3be52b --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_avx512.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2025 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +static INLINE unsigned int sad64xh_avx512(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + int i, res; + __m512i sad_reg, ref_reg; + __m512i sum_sad = _mm512_setzero_si512(); + for (i = 0; i < h; i++) { + ref_reg = _mm512_loadu_si512((const __m512i *)ref_ptr); + sad_reg = + _mm512_sad_epu8(ref_reg, _mm512_loadu_si512((__m512 const *)src_ptr)); + sum_sad = _mm512_add_epi32(sum_sad, sad_reg); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + res = _mm512_reduce_add_epi32(sum_sad); + return res; +} + +#define FSAD64_H(h) \ + unsigned int vpx_sad64x##h##_avx512(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride) { \ + return sad64xh_avx512(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx512( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx512(src_ptr, src_stride * 2, ref_ptr, \ + ref_stride * 2, h / 2); \ + } + +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) + +FSAD64 + +#undef FSAD64 +#undef FSAD64_H +#undef FSADS64_H + +#define FSADAVG64_H(h) \ + unsigned int vpx_sad64x##h##_avg_avx512( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i; \ + __m512i sad_reg, ref_reg; \ + __m512i sum_sad = _mm512_setzero_si512(); \ + for (i = 0; i < h; i++) { \ + ref_reg = _mm512_loadu_si512((const __m512i *)ref_ptr); \ + ref_reg = _mm512_avg_epu8( \ + ref_reg, _mm512_loadu_si512((const __m512i *)second_pred)); \ + sad_reg = _mm512_sad_epu8(ref_reg, \ + _mm512_loadu_si512((const __m512i *)src_ptr)); \ + sum_sad = _mm512_add_epi32(sum_sad, sad_reg); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + return (unsigned int)_mm512_reduce_add_epi32(sum_sad); \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64) \ + FSADAVG64_H(32) + +FSADAVG64 + +#undef FSADAVG64 +#undef FSADAVG64_H diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm index 1ec906c236..627e463bf8 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse2.asm @@ -12,30 +12,48 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 -cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ +cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm deleted file mode 100644 index 18279bdb9d..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,374 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -;void int vpx_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad16x16x3_sse3) PRIVATE -sym(vpx_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad16x8x3_sse3) PRIVATE -sym(vpx_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad8x16x3_sse3) PRIVATE -sym(vpx_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad8x8x3_sse3) PRIVATE -sym(vpx_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vpx_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad4x4x3_sse3) PRIVATE -sym(vpx_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm deleted file mode 100644 index bc67447971..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,359 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -;void vpx_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -global sym(vpx_sad16x16x8_sse4_1) PRIVATE -sym(vpx_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vpx_sad16x8x8_sse4_1) PRIVATE -sym(vpx_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vpx_sad8x8x8_sse4_1) PRIVATE -sym(vpx_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vpx_sad8x16x8_sse4_1) PRIVATE -sym(vpx_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vpx_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vpx_sad4x4x8_sse4_1) PRIVATE -sym(vpx_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm deleted file mode 100644 index 49f204fa04..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,370 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -;void int vpx_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad16x16x3_ssse3) PRIVATE -sym(vpx_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x16x3_ssse3_skiptable -.vpx_sad16x16x3_ssse3_jumptable: - dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump - dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_skiptable: - - call .vpx_sad16x16x3_ssse3_do_jump -.vpx_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 - -.vpx_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vpx_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vpx_sad16x8x3_ssse3) PRIVATE -sym(vpx_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vpx_sad16x8x3_ssse3_skiptable -.vpx_sad16x8x3_ssse3_jumptable: - dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump - dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_skiptable: - - call .vpx_sad16x8x3_ssse3_do_jump -.vpx_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 - -.vpx_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vpx_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 0000000000..dfe45b6115 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 0000000000..4a7585c57e --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm index 6d58321e03..1ad3b88c8d 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -44,7 +44,10 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro -;void ssim_parms_sse2( + +SECTION .text + +;void vpx_ssim_parms_8x8_sse2( ; unsigned char *s, ; int sp, ; unsigned char *r, @@ -61,97 +64,7 @@ ; or pavgb At this point this is just meant to be first pass for calculating ; all the parms needed for 16x16 ssim so we can play with dssim as distortion ; in mode selection code. -global sym(vpx_ssim_parms_16x16_sse2) PRIVATE -sym(vpx_ssim_parms_16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 15 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rcx, arg(1) ;sp - mov rdi, arg(2) ;r - mov rax, arg(3) ;rp - - pxor xmm0, xmm0 - pxor xmm15,xmm15 ;sum_s - pxor xmm14,xmm14 ;sum_r - pxor xmm13,xmm13 ;sum_sq_s - pxor xmm12,xmm12 ;sum_sq_r - pxor xmm11,xmm11 ;sum_sxr - - mov rdx, 16 ;row counter -.NextRow: - - ;grab source and reference pixels - movdqu xmm5, [rsi] - movdqu xmm6, [rdi] - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 - punpckhbw xmm3, xmm0 ; high_s - punpckhbw xmm4, xmm0 ; high_r - - TABULATE_SSIM - - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 - punpcklbw xmm3, xmm0 ; low_s - punpcklbw xmm4, xmm0 ; low_r - - TABULATE_SSIM - - add rsi, rcx ; next s row - add rdi, rax ; next r row - - dec rdx ; counter - jnz .NextRow - - SUM_ACROSS_W xmm15 - SUM_ACROSS_W xmm14 - SUM_ACROSS_Q xmm13 - SUM_ACROSS_Q xmm12 - SUM_ACROSS_Q xmm11 - - mov rdi,arg(4) - movd [rdi], xmm15; - mov rdi,arg(5) - movd [rdi], xmm14; - mov rdi,arg(6) - movd [rdi], xmm13; - mov rdi,arg(7) - movd [rdi], xmm12; - mov rdi,arg(8) - movd [rdi], xmm11; - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void ssim_parms_sse2( -; unsigned char *s, -; int sp, -; unsigned char *r, -; int rp -; uint32_t *sum_s, -; uint32_t *sum_r, -; uint32_t *sum_sq_s, -; uint32_t *sum_sq_r, -; uint32_t *sum_sxr); -; -; TODO: Use parm passing through structure, probably don't need the pxors -; ( calling app will initialize to 0 ) could easily fit everything in sse2 -; without too much hastle, and can probably do better estimates with psadw -; or pavgb At this point this is just meant to be first pass for calculating -; all the parms needed for 16x16 ssim so we can play with dssim as distortion -; in mode selection code. -global sym(vpx_ssim_parms_8x8_sse2) PRIVATE +globalsym(vpx_ssim_parms_8x8_sse2) sym(vpx_ssim_parms_8x8_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm index cee4468c1f..d1d8d3460e 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm @@ -41,12 +41,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 paddw %5, %3 @@ -95,7 +95,7 @@ SECTION .text %endmacro %macro INC_SRC_BY_SRC_STRIDE 0 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 add srcq, src_stridemp %else add srcq, src_strideq @@ -114,84 +114,65 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if VPX_ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, second_pred, second_stride, \ + height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -211,7 +192,7 @@ SECTION .text %if %1 < 16 sar block_height, 1 %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif %endif @@ -226,9 +207,9 @@ SECTION .text .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] - mova m1, [dstq] + mova m1, [refq] %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif @@ -242,7 +223,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] %if %2 == 1 ; avg @@ -256,14 +237,14 @@ SECTION .text movx m2, [srcq+src_strideq] %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] %if %2 == 1 ; avg %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -284,10 +265,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_zero_loop @@ -302,11 +283,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -314,7 +295,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] @@ -325,22 +306,22 @@ SECTION .text movx m1, [srcq+src_strideq*2] punpckldq m2, m1 %endif - movx m1, [dstq] + movx m1, [refq] %if %1 > 4 movlhps m0, m2 %else ; 4xh punpckldq m0, m2 %endif - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m0, m2 punpcklbw m1, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m4, [secq] + movh m4, [second_predq] pavgb m0, m4 punpcklbw m3, m5 punpcklbw m0, m5 @@ -348,9 +329,9 @@ SECTION .text %endif %else ; !avg movx m4, [srcq+src_strideq*2] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -360,10 +341,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_half_loop @@ -371,21 +352,21 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 +%if VPX_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ; x86-32 or mmx -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0, reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm @@ -397,7 +378,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -405,7 +386,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -437,7 +418,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -446,14 +427,14 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -473,7 +454,7 @@ SECTION .text pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -485,11 +466,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -499,10 +480,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_other_loop @@ -523,11 +504,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -535,7 +516,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m4, [srcq+1] @@ -549,17 +530,17 @@ SECTION .text movx m2, [srcq+src_strideq+1] punpckldq m4, m2 %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] pavgb m0, m4 punpcklbw m3, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m1, m5 punpcklbw m0, m5 @@ -567,10 +548,10 @@ SECTION .text %endif %else ; !avg movx m2, [srcq+src_strideq] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m4 movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -580,10 +561,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_zero_loop @@ -602,13 +583,13 @@ SECTION .text .x_half_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m3 punpckhbw m3, m1, m5 pavgb m0, m4 %if %2 == 1 ; avg punpcklbw m1, m5 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else @@ -620,7 +601,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -647,13 +628,13 @@ SECTION .text punpckldq m0, m2 pshuflw m4, m2, 0xe %endif - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -672,8 +653,8 @@ SECTION .text pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 @@ -683,10 +664,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_half_loop @@ -694,21 +675,21 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 +%if VPX_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+y_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 %else ;x86_32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; x_offset == 0.5. We can reuse x_offset reg %define tempq x_offsetq add y_offsetq, g_bilin_filterm @@ -720,7 +701,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -732,7 +713,7 @@ SECTION .text .x_half_y_other_loop: movu m4, [srcq] movu m2, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m2 %if cpuflag(ssse3) punpckhbw m2, m0, m4 @@ -762,7 +743,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -771,7 +752,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -787,9 +768,9 @@ SECTION .text movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -809,7 +790,7 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] %endif psraw m0, 4 psraw m2, 4 @@ -820,11 +801,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -835,10 +816,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_other_loop @@ -852,21 +833,21 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 +%if VPX_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ;y_offset == 0. We can reuse y_offset reg. %define tempq y_offsetq add x_offsetq, g_bilin_filterm @@ -878,7 +859,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -886,7 +867,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -913,7 +894,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -922,16 +903,16 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 - movx m1, [dstq] + movx m1, [refq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a @@ -951,7 +932,7 @@ SECTION .text pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -963,11 +944,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -977,10 +958,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_zero_loop @@ -994,21 +975,21 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 +%if VPX_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; y_offset == 0.5. We can reuse y_offset reg. %define tempq y_offsetq add x_offsetq, g_bilin_filterm @@ -1020,7 +1001,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1056,7 +1037,7 @@ SECTION .text movu m4, [srcq] movu m3, [srcq+1] %if cpuflag(ssse3) - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1082,7 +1063,7 @@ SECTION .text paddw m2, filter_rnd paddw m4, m3 paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] psraw m4, 4 psraw m2, 4 punpckhbw m3, m1, m5 @@ -1096,7 +1077,7 @@ SECTION .text %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 @@ -1104,7 +1085,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1132,8 +1113,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else @@ -1148,9 +1129,9 @@ SECTION .text pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] paddw m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %endif psraw m2, 4 psraw m4, 4 @@ -1163,11 +1144,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1179,10 +1160,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_half_loop @@ -1192,12 +1173,12 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if VPX_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift -%if ARCH_X86_64 && %1 > 4 +%if VPX_ARCH_X86_64 && %1 > 4 mova m8, [bilin_filter+x_offsetq] %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] @@ -1206,14 +1187,14 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 %define filter_y_b m11 %define filter_rnd m12 %else ; x86-32 -%if ARCH_X86=1 && CONFIG_PIC=1 +%if VPX_ARCH_X86=1 && CONFIG_PIC=1 ; In this case, there is NO unused register. Used src_stride register. Later, ; src_stride has to be loaded from stack when it is needed. %define tempq src_strideq @@ -1234,7 +1215,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1273,7 +1254,7 @@ SECTION .text %if cpuflag(ssse3) movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1319,7 +1300,7 @@ SECTION .text pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 @@ -1330,7 +1311,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -1338,7 +1319,7 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1374,8 +1355,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] + movx m3, [refq+ref_strideq] + movx m1, [refq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 @@ -1414,9 +1395,9 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 @@ -1429,11 +1410,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1443,10 +1424,10 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_other_loop diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c new file mode 100644 index 0000000000..4849581ed4 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_avx2.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr, + const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static VPX_FORCE_INLINE void subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m256i s_0 = _mm256_cvtepu8_epi16(s); + const __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_storeu_si256((__m256i *)diff_ptr, d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static VPX_FORCE_INLINE void subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int j; + for (j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 32: + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + case 64: + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + break; + default: + vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, + const uint8_t *src8_ptr, + ptrdiff_t src_stride, + const uint8_t *pred8_ptr, + ptrdiff_t pred_stride, int bd) { + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr); + (void)bd; + if (cols == 64) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32)); + const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32)); + const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + const __m256i d2 = _mm256_sub_epi16(s2, p2); + const __m256i d3 = _mm256_sub_epi16(s3, p3); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2); + _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 32) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } while (--j != 0); + } else if (cols == 16) { + int j = rows; + do { + const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr); + const __m256i s1 = + _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride)); + const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr); + const __m256i p1 = + _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride)); + const __m256i d0 = _mm256_sub_epi16(s0, p0); + const __m256i d1 = _mm256_sub_epi16(s1, p1); + _mm256_storeu_si256((__m256i *)diff_ptr, d0); + _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else if (cols == 8) { + int j = rows; + do { + const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr); + const __m128i s1 = + _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storeu_si128((__m128i *)diff_ptr, d0); + _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } else { + int j = rows; + assert(cols == 4); + do { + const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i s1 = + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr); + const __m128i p1 = + _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride)); + const __m128i d0 = _mm_sub_epi16(s0, p0); + const __m128i d1 = _mm_sub_epi16(s1, p1); + _mm_storel_epi64((__m128i *)diff_ptr, d0); + _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1); + src_ptr += src_stride << 1; + pred_ptr += pred_stride << 1; + diff_ptr += diff_stride << 1; + j -= 2; + } while (j != 0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm index 4273efb854..e3055ab292 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET diff --git a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c index bc5362e10f..df6514b2c4 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -10,119 +10,96 @@ #include #include -#include #include "./vpx_dsp_rtcd.h" - -static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -// TODO(jingning): Evaluate the performance impact here. -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4. -__attribute__((noinline)) -#endif -static uint64_t -vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { - int r, c; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - for (r = 0; r < size; r += 8) { - __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < size; c += 8) { - const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - - src += 8 * stride; - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } -#endif -} +#include "vpx_dsp/x86/mem_sse2.h" uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { - // 4 elements per row only requires half an XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. + // Over 75% of all calls are with size == 4. if (size == 4) { - return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); } else { // Generic case - return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1); + __m128i v_acc_q = _mm_setzero_si128(); + + assert(size % 8 == 0); + + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if VPX_ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif } } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h new file mode 100644 index 0000000000..b4f1190d74 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "./vpx_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 16 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b787..de5ce43b00 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/media/libvpx/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ #include #include "vpx/vpx_integer.h" @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) @@ -26,4 +29,4 @@ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c index 8428e0520d..8305b9f20f 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c @@ -7,137 +7,852 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ + +#include // AVX2 + #include "./vpx_dsp_rtcd.h" -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; -void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int w, int h, - unsigned int *sse, int *sum, get_var_avx2 var_fn, - int block_size) { - int i, j; +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); - *sse = 0; - *sum = 0; + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse, + __m128i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse), + _mm256_extractf128_si256(vsse, 1)); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); +} + +static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse, + __m256i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 0 r15 0 r14 0 r13 0 r12 0 r11 0 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; } } -unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - vpx_get16x16var_avx2, 16); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } } -unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \ + *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. +static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. +static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + second_pred += second_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + second_pred += second_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + second_pred += second_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, int second_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *second_pred, + int second_stride, int height, + unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + second_pred, second_stride, 1, height, sse); +} + +typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get32x32var_avx2, 32); + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); +} + +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + __m256i vsse, vsum; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get32x32var_avx2, 32); - return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); -} - -unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); +} -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + int i = 0; -unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { + for (i = 0; i < 2; i++) { + __m256i vsum16; + variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); + } + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_sub_pixel_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + const int se1 = sub_pixel_variance32xh_avx2( + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); unsigned int sse2; const int se2 = - vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, + ref_ptr + 32, ref_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); +unsigned int vpx_sub_pixel_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + const int se = sub_pixel_variance32xh_avx2( + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } unsigned int vpx_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 64, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); + const int se2 = sub_pixel_avg_variance32xh_avx2( + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, + second_pred + 32, 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -146,10 +861,12 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( } unsigned int vpx_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 51e6b19ad1..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } -} - -void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const *)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } -} - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c index 1161da4914..d6eb12da1a 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_sse2.c @@ -8,312 +8,426 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" - #include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" -typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return (unsigned int)_mm_cvtsi128_si32(val); +} -unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { +unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; + src_ptr += 8; } - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); + return add32x4_sse2(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8( \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride)); + const __m128i p01 = _mm_unpacklo_epi32(p0, p1); + return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); +} -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); +static INLINE void variance_kernel_sse2(const __m128i src_ptr, + const __m128i ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsum = - _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } - - // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); } -void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = (int)add32x4_sse2(vsum); +} + +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE int sum_final_sse2(const __m128i sum) { + const __m128i t = sum_to_32bit_sse2(sum); + return (int)add32x4_sse2(t); +} + +static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { int i; - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + assert(h <= 256); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); + for (i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src_ptr, src_stride); + const __m128i r = load4x2_sse2(ref_ptr, ref_stride); - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - - src += src_stride; - ref += ref_stride; - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = - (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, int w, - int h, unsigned int *sse, int *sum, - getNxMvar_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } + variance_kernel_sse2(s, r, sse, sum); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } } -unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + int i; + + assert(h <= 128); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; i++) { + const __m128i s = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); + const __m128i r = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, + const uint8_t *const ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 64); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 16); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); + variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, sum); +} + +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 4); } -unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, - get4x4var_sse2, 4); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, - get4x4var_sse2, 4); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 6); } -unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, - vpx_get8x8var_sse2, 8); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, - vpx_get8x8var_sse2, 8); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get16x16var_sse2, 16); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; + int sum; + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + *sse = add32x4_sse2(vsse); + sum = sum_final_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get16x16var_sse2, 16); + int i = 0; + + for (i = 0; i < 4; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = (int)add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } -unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} - -unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} - -unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -324,66 +438,67 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ + &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) -FNS(sse2, sse2); -FNS(ssse3, ssse3); +FNS(sse2, sse2) +FNS(ssse3, ssse3) #undef FNS #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int vpx_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ + const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -394,56 +509,57 @@ DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \ + second_pred, w, h, &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)) \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)) \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)) \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)) \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)) \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)) \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)) \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)) \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)) \ FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) -FNS(sse2, sse); -FNS(ssse3, ssse3); +FNS(sse2, sse) +FNS(ssse3, ssse3) #undef FNS #undef FN diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 727d9d1156..0000000000 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index e2311c1167..3f444e2e6a 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,21 +20,19 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd shl r4d, 1 - shl srcq, 1 shl src_strideq, 1 - shl dstq, 1 shl dst_strideq, 1 %else cmp r4d, 4 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index bfc816f235..fc301fb39e 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -45,7 +45,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -121,7 +121,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -197,7 +197,9 @@ movdqu [rdi + %2], xmm0 %endm -;void vpx_filter_block1d4_v8_sse2 +SECTION .text + +;void vpx_highbd_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -206,7 +208,7 @@ ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_v8_sse2) sym(vpx_highbd_filter_block1d4_v8_sse2): push rbp mov rbp, rsp @@ -267,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vpx_filter_block1d8_v8_sse2 +;void vpx_highbd_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -276,7 +278,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d8_v8_sse2) sym(vpx_highbd_filter_block1d8_v8_sse2): push rbp mov rbp, rsp @@ -326,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vpx_filter_block1d16_v8_sse2 +;void vpx_highbd_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -335,7 +337,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_v8_sse2) sym(vpx_highbd_filter_block1d16_v8_sse2): push rbp mov rbp, rsp @@ -389,7 +391,7 @@ sym(vpx_highbd_filter_block1d16_v8_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2) sym(vpx_highbd_filter_block1d4_v8_avg_sse2): push rbp mov rbp, rsp @@ -450,7 +452,7 @@ sym(vpx_highbd_filter_block1d4_v8_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2) sym(vpx_highbd_filter_block1d8_v8_avg_sse2): push rbp mov rbp, rsp @@ -499,7 +501,7 @@ sym(vpx_highbd_filter_block1d8_v8_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2) sym(vpx_highbd_filter_block1d16_v8_avg_sse2): push rbp mov rbp, rsp @@ -552,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vpx_filter_block1d4_h8_sse2 +;void vpx_highbd_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -561,7 +563,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_h8_sse2) sym(vpx_highbd_filter_block1d4_h8_sse2): push rbp mov rbp, rsp @@ -627,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vpx_filter_block1d8_h8_sse2 +;void vpx_highbd_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -636,7 +638,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d8_h8_sse2) sym(vpx_highbd_filter_block1d8_h8_sse2): push rbp mov rbp, rsp @@ -693,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vpx_filter_block1d16_h8_sse2 +;void vpx_highbd_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -702,7 +704,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_h8_sse2) sym(vpx_highbd_filter_block1d16_h8_sse2): push rbp mov rbp, rsp @@ -770,7 +772,7 @@ sym(vpx_highbd_filter_block1d16_h8_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2) sym(vpx_highbd_filter_block1d4_h8_avg_sse2): push rbp mov rbp, rsp @@ -836,7 +838,7 @@ sym(vpx_highbd_filter_block1d4_h8_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2) sym(vpx_highbd_filter_block1d8_h8_avg_sse2): push rbp mov rbp, rsp @@ -893,7 +895,7 @@ sym(vpx_highbd_filter_block1d8_h8_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2) sym(vpx_highbd_filter_block1d16_h8_avg_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 72f2ff71da..bd51c75bcb 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -26,7 +26,7 @@ pshufd xmm3, xmm3, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm5, rdx movq xmm2, rcx pshufd xmm5, xmm5, 0b @@ -64,7 +64,7 @@ dec rcx %endm -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %macro HIGH_GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -82,7 +82,7 @@ pshufd xmm4, xmm4, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm8, rdx movq xmm5, rcx pshufd xmm8, xmm8, 0b @@ -171,7 +171,9 @@ %endm %endif -global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE +SECTION .text + +globalsym(vpx_highbd_filter_block1d4_v2_sse2) sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp mov rbp, rsp @@ -195,8 +197,8 @@ sym(vpx_highbd_filter_block1d4_v2_sse2): pop rbp ret -%if ARCH_X86_64 -global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_v2_sse2) sym(vpx_highbd_filter_block1d8_v2_sse2): push rbp mov rbp, rsp @@ -222,7 +224,7 @@ sym(vpx_highbd_filter_block1d8_v2_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_v2_sse2) sym(vpx_highbd_filter_block1d16_v2_sse2): push rbp mov rbp, rsp @@ -251,7 +253,7 @@ sym(vpx_highbd_filter_block1d16_v2_sse2): ret %endif -global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_v2_avg_sse2) sym(vpx_highbd_filter_block1d4_v2_avg_sse2): push rbp mov rbp, rsp @@ -275,8 +277,8 @@ sym(vpx_highbd_filter_block1d4_v2_avg_sse2): pop rbp ret -%if ARCH_X86_64 -global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_v2_avg_sse2) sym(vpx_highbd_filter_block1d8_v2_avg_sse2): push rbp mov rbp, rsp @@ -302,7 +304,7 @@ sym(vpx_highbd_filter_block1d8_v2_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_v2_avg_sse2) sym(vpx_highbd_filter_block1d16_v2_avg_sse2): push rbp mov rbp, rsp @@ -331,7 +333,7 @@ sym(vpx_highbd_filter_block1d16_v2_avg_sse2): ret %endif -global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_h2_sse2) sym(vpx_highbd_filter_block1d4_h2_sse2): push rbp mov rbp, rsp @@ -356,8 +358,8 @@ sym(vpx_highbd_filter_block1d4_h2_sse2): pop rbp ret -%if ARCH_X86_64 -global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_h2_sse2) sym(vpx_highbd_filter_block1d8_h2_sse2): push rbp mov rbp, rsp @@ -383,7 +385,7 @@ sym(vpx_highbd_filter_block1d8_h2_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_h2_sse2) sym(vpx_highbd_filter_block1d16_h2_sse2): push rbp mov rbp, rsp @@ -412,7 +414,7 @@ sym(vpx_highbd_filter_block1d16_h2_sse2): ret %endif -global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d4_h2_avg_sse2) sym(vpx_highbd_filter_block1d4_h2_avg_sse2): push rbp mov rbp, rsp @@ -437,8 +439,8 @@ sym(vpx_highbd_filter_block1d4_h2_avg_sse2): pop rbp ret -%if ARCH_X86_64 -global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE +%if VPX_ARCH_X86_64 +globalsym(vpx_highbd_filter_block1d8_h2_avg_sse2) sym(vpx_highbd_filter_block1d8_h2_avg_sse2): push rbp mov rbp, rsp @@ -464,7 +466,7 @@ sym(vpx_highbd_filter_block1d8_h2_avg_sse2): pop rbp ret -global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE +globalsym(vpx_highbd_filter_block1d16_h2_avg_sse2) sym(vpx_highbd_filter_block1d16_h2_avg_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c new file mode 100644 index 0000000000..21a35ae3c3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -0,0 +1,1161 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_ports/mem.h" + +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + +static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first, dst_second; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + + // Do again to get the second half of dst + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 14 12 10 8 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 15 13 11 9 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the second half of the dst + dst_second = mm_zip_epi32_sse2(&even, &odd); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* The macro used to generate functions shifts the src_ptr up by 3 rows already + * */ + +static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1, + src_reg_m10_hi_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128()); + src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128()); + src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Now repeat everything again for the second half + // Partial output for second half + res_reg_m10_hi = mm_madd_packs_epi16_sse2( + &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); + + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); + + src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); + src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); + + src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); + src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2, + &kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_m10_hi_1 = src_reg_12_hi_1; + src_reg_m10_hi_2 = src_reg_12_hi_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_01_hi_1 = src_reg_23_hi_1; + src_reg_01_hi_2 = src_reg_23_hi_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128()); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i tmp_0, tmp_1; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Then we call multiply and add to get partial results + // s[1]k[3]+s[0]k[2] s[0]k[3]s[-1]k[2] + // s[3]k[5]+s[2]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Convert to 16-bit words + src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128()); + src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128()); + src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128()); + src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128()); + + // Shuffle into the right format + tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1); + tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3); + + // Partial output + tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23); + tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45); + + // Output + dst_first = _mm_add_epi32(tmp_0, tmp_1); + dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128()); + + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1; + __m128i src_reg_01_lo_1; + __m128i src_reg_12_lo_1; + __m128i src_reg_23_lo_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, &kernel_reg_23); + + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); + + // Save only half of the register (8 words) + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_1 = src_reg_3; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 2); + src_reg_shift_2 = _mm_srli_si128(src_reg, 4); + src_reg_shift_3 = _mm_srli_si128(src_reg, 6); + + // Output 2 0 + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 3 1 + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + res_reg = _mm_unpacklo_epi32(even, odd); + res_reg = mm_round_epi32_sse2(&res_reg, ®_round, CONV8_ROUNDING_BITS); + res_reg = _mm_packs_epi32(res_reg, reg_zero); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + _mm_storel_epi64((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23); + res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45); + + // Add to get results + res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = + mm_round_epi32_sse2(&res_reg_m1012, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123 = + mm_round_epi32_sse2(&res_reg_0123, ®_round, CONV8_ROUNDING_BITS); + + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + + __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2, + src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + __m128i tmp_0, tmp_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will put first half in the first half of the reg, and second half in + // second half + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // Output 6 4 2 0 + tmp_0 = _mm_srli_si128(src_reg, 4); + tmp_1 = _mm_srli_si128(src_reg_next, 2); + src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1); + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + tmp_0 = _mm_srli_si128(src_reg, 2); + tmp_1 = src_reg_next; + src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + tmp_0 = _mm_srli_si128(src_reg, 6); + tmp_1 = _mm_srli_si128(src_reg_next, 4); + src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + even = mm_round_epi32_sse2(&even, ®_round, CONV8_ROUNDING_BITS); + odd = mm_round_epi32_sse2(&odd, ®_round, CONV8_ROUNDING_BITS); + res_reg = mm_zip_epi32_sse2(&even, &odd); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + + _mm_store_si128((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi; + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + __m128i res_reg_m1012_hi, res_reg_0123_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3); + + // Partial output for first half + res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23); + res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = + mm_round_epi32_sse2(&res_reg_m1012_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_lo = + mm_round_epi32_sse2(&res_reg_0123_lo, ®_round, CONV8_ROUNDING_BITS); + + // Partial output for first half + res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23); + res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45); + + // Add to get results + res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_hi = + mm_round_epi32_sse2(&res_reg_m1012_hi, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_hi = + mm_round_epi32_sse2(&res_reg_0123_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine the two halfs + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +static void vpx_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 + +// From vpx_subpixel_8t_sse2.asm. +filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 +#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 +#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 +#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 +#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 +#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm. +filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , + sse2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1) + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2, 0) +FUN_CONV_2D(avg_, sse2, 1) + +#if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0) +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , sse2, 0) +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1) +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1) + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2, 0) +HIGH_FUN_CONV_2D(avg_, sse2, 1) +#endif // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc0148..526c283823 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -9,21 +9,25 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; +// filters for 16_h8 +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, @@ -35,540 +39,1336 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; -static void vpx_filter_block1d16_h8_avx2( +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + if (avg) { + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); + } + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; - - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + // shrink to 8 bit each 16 bits + outReg1 = _mm_packus_epi16(outReg1, outReg2); - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } -static void vpx_filter_block1d16_v8_avx2( +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static void vpx_filter_block1d8_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. + if (y > 0) { + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + { + __m128i s[6]; + __m256i s32b[6]; - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + // have each consecutive loads on the same 256 register + s32b[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + // The output_height is always a multiple of two. + assert(!(output_height & 1)); for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) + s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + // average if necessary + if (avg) { + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); + } - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } - if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; - // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); +} - // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); +static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_first, dst_second; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + // Partial result for first half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + // Partial result for second half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + dst_second = mm256_round_epi16(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm256_packus_epi16(dst_first, dst_second); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_first); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + + dst_first = _mm256_packus_epi16(dst_first, dst_first); + dst_first = _mm256_permute4x64_epi64(dst_first, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first)); + } +} + +static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi; + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23); + + // Output from first half + res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23); + res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45); + res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo); + + // Output from second half + res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23); + res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45); + res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi); + + // Round the words + res_reg_lo = mm256_round_epi16(&res_reg_lo, ®_32, 6); + res_reg_hi = mm256_round_epi16(&res_reg_hi, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_reg = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_reg = mm256_round_epi16(&dst_reg, ®_32, 6); + + // Finally combine to get the final dst + dst_reg = _mm256_packus_epi16(dst_reg, dst_reg); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + __m128i dst_reg; + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding + __m128i tmp_0, tmp_1; + + __m128i src_reg_shift_0 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0)); + __m128i src_reg_shift_2 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2)); + + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, + _mm256_castsi256_si128(kernel_reg_23)); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, + _mm256_castsi256_si128(kernel_reg_45)); + dst_reg = _mm_adds_epi16(tmp_0, tmp_1); + + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32_128, 6); + + dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_reg); + } +} + +static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001, res_reg_1223; + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Output + res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23); + res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45); + res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + // Since avx2 has 256-bit register, we can do 2 rows at a time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + int h; + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + + __m256i shuf_idx = + _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, + 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + for (h = height; h > 1; h -= 2) { + // Load the source + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); + + // Round result + dst = mm256_round_epi16(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm256_packus_epi16(dst, _mm256_setzero_si256()); + + // Save + mm256_storeu2_epi32((__m128i *const)dst_ptr, + (__m128i *const)(dst_ptr + dst_stride), &dst); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + if (h > 0) { + // Load the source + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding + __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); + __m128i src_reg_shuf = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); + + // Get the result + __m128i dst = + _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg)); + dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); + + // Round result + dst = mm_round_epi16_sse2(&dst, ®_32_128, 6); + + // Pack to 8-bits + dst = _mm_packus_epi16(dst, _mm_setzero_si128()); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + } +} + +static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. + // Calling horizontal add then gives us the completely output + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + + // Result after multiply and add + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Combine all the rows + src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223); + + // Output + res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg); + res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256()); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. + assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[0]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; + unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... + // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[9], rr[2]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. + assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + s[5] = s[9]; + + r[4] = r[8]; + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4], r1[4]; + + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); } } #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 +#endif // VPX_ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 + +#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2 +#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2 +#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2 +#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2 +#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2 +#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + avx2, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1) // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, avx2); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2, 0) +FUN_CONV_2D(avg_, avx2, 1) #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 09c75d455c..4ea2752d38 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -8,52 +8,51 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include +#include // SSSE3 +#include + +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - -// These are reused by the avx2 intrinsics. -filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +// Used by the avx2 implementation. +#if VPX_ARCH_X86_64 +// Use the intrinsics below filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; +#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 +#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 +#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 +#else // VPX_ARCH_X86 +// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm. +filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +#endif +#if VPX_ARCH_X86_64 void vpx_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,8 +74,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); @@ -89,25 +88,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); @@ -117,77 +114,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3( } void vpx_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; unsigned int i; + __m128i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + s[0] = _mm_packus_epi16(s[0], s[0]); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); output_ptr += output_pitch; } @@ -196,94 +151,537 @@ void vpx_filter_block1d8_h8_intrin_ssse3( void vpx_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; unsigned int i; + __m128i f[4], s[8], ss[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + shuffle_filter_ssse3(filter, f); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + ss[0] = convolve8_8_ssse3(ss, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + ss[0] = _mm_packus_epi16(ss[0], ss[0]); src_ptr += src_pitch; // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); output_ptr += out_pitch; } } +#endif // VPX_ARCH_X86_64 +static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first, dst_second; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23); + + res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Partial output for second half + res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23); + + res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Get the result + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Round round result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23); + + res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45); + + // Add to get entire output + res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, ®_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, ®_32, 6); + + // Pack from 16-bit to 8-bit + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + + __m128i kernel_reg; // Kernel + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shuf; + __m128i dst_first; + __m128i shuf_idx = + _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg); + dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); + + // Round result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call horizontal add to get the output. + // Finally, we can add multiple rows together to get the desired output. + // This is done two rows at a time + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + __m128i src_reg_m1001, src_reg_1223; + __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi; + + __m128i kernel_reg; // Kernel + + // Result after multiply and add + __m128i reg_0, reg_1; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1); + + // Put three rows next to each other + src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3); + + // Put three rows next to each other + src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23); + + // Put all four rows next to each other + src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223); + src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223); + + // Get the results + reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg); + reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg); + reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128()); + reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128()); + + // Round the words + reg_0 = mm_round_epi16_sse2(®_0, ®_32, 6); + reg_1 = mm_round_epi16_sse2(®_1, ®_32, 6); + + // Pack from 16-bit to 8-bit and put them in the right order + reg_0 = _mm_packus_epi16(reg_0, reg_0); + reg_1 = _mm_packus_epi16(reg_1, reg_1); + + // Save the result + *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0); + *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm filter8_1dfunction vpx_filter_block1d16_v8_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_ssse3; filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; @@ -291,6 +689,15 @@ filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3 +#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3 +#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -306,149 +713,71 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0) +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + ssse3, 0) +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1) +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1) -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); } -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; - // This function processes 8x8 areas. The intermediate height is not always + // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. y = h + (8 - (h & 0x7)); @@ -479,93 +808,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, } while (y -= 8); } -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); } -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -597,50 +883,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +static __m128i filter_vert_kernel(const __m128i *const s, + const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -659,50 +936,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); +static void filter_vert_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -719,81 +967,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + loadu_8bit_16x8(src, src_stride, s); + + // merge the result together + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; + src += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -811,11 +1022,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -829,60 +1039,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_, ssse3); +FUN_CONV_2D(, ssse3, 0) +FUN_CONV_2D(avg_, ssse3, 1) diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 08f3d6a6cf..c8455e13a2 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,6 +176,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, @@ -185,7 +187,7 @@ ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d4_v8_sse2) PRIVATE +globalsym(vpx_filter_block1d4_v8_sse2) sym(vpx_filter_block1d4_v8_sse2): push rbp mov rbp, rsp @@ -252,7 +254,7 @@ sym(vpx_filter_block1d4_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d8_v8_sse2) PRIVATE +globalsym(vpx_filter_block1d8_v8_sse2) sym(vpx_filter_block1d8_v8_sse2): push rbp mov rbp, rsp @@ -311,7 +313,7 @@ sym(vpx_filter_block1d8_v8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d16_v8_sse2) PRIVATE +globalsym(vpx_filter_block1d16_v8_sse2) sym(vpx_filter_block1d16_v8_sse2): push rbp mov rbp, rsp @@ -365,7 +367,7 @@ sym(vpx_filter_block1d16_v8_sse2): pop rbp ret -global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d4_v8_avg_sse2) sym(vpx_filter_block1d4_v8_avg_sse2): push rbp mov rbp, rsp @@ -423,7 +425,7 @@ sym(vpx_filter_block1d4_v8_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d8_v8_avg_sse2) sym(vpx_filter_block1d8_v8_avg_sse2): push rbp mov rbp, rsp @@ -472,7 +474,7 @@ sym(vpx_filter_block1d8_v8_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d16_v8_avg_sse2) sym(vpx_filter_block1d16_v8_avg_sse2): push rbp mov rbp, rsp @@ -534,7 +536,7 @@ sym(vpx_filter_block1d16_v8_avg_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d4_h8_sse2) PRIVATE +globalsym(vpx_filter_block1d4_h8_sse2) sym(vpx_filter_block1d4_h8_sse2): push rbp mov rbp, rsp @@ -608,7 +610,7 @@ sym(vpx_filter_block1d4_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d8_h8_sse2) PRIVATE +globalsym(vpx_filter_block1d8_h8_sse2) sym(vpx_filter_block1d8_h8_sse2): push rbp mov rbp, rsp @@ -683,7 +685,7 @@ sym(vpx_filter_block1d8_h8_sse2): ; unsigned int output_height, ; short *filter ;) -global sym(vpx_filter_block1d16_h8_sse2) PRIVATE +globalsym(vpx_filter_block1d16_h8_sse2) sym(vpx_filter_block1d16_h8_sse2): push rbp mov rbp, rsp @@ -769,7 +771,7 @@ sym(vpx_filter_block1d16_h8_sse2): pop rbp ret -global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d4_h8_avg_sse2) sym(vpx_filter_block1d4_h8_avg_sse2): push rbp mov rbp, rsp @@ -834,7 +836,7 @@ sym(vpx_filter_block1d4_h8_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d8_h8_avg_sse2) sym(vpx_filter_block1d8_h8_avg_sse2): push rbp mov rbp, rsp @@ -900,7 +902,7 @@ sym(vpx_filter_block1d8_h8_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d16_h8_avg_sse2) sym(vpx_filter_block1d16_h8_avg_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index c1a6f23abe..fe617f1207 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -26,7 +26,7 @@ SECTION .text %define LOCAL_VARS_SIZE 16*6 %macro SETUP_LOCAL_VARS 0 - ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; TODO(slavarnway): using xmm registers for these on VPX_ARCH_X86_64 + ; pmaddubsw has a higher latency on some platforms, this might be eased by ; interleaving the instructions. %define k0k1 [rsp + 16*0] @@ -48,7 +48,7 @@ SECTION .text mova k2k3, m1 mova k4k5, m2 mova k6k7, m3 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define krd m12 %define tmp0 [rsp + 16*4] %define tmp1 [rsp + 16*5] @@ -68,7 +68,7 @@ SECTION .text %endm ;------------------------------------------------------------------------------- -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define LOCAL_VARS_SIZE_H4 0 %else %define LOCAL_VARS_SIZE_H4 16*4 @@ -79,7 +79,7 @@ cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ src, sstride, dst, dstride, height, filter mova m4, [filterq] packsswb m4, m4 -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define k0k1k4k5 m8 %define k2k3k6k7 m9 %define krd m10 @@ -327,19 +327,19 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 +SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 ;------------------------------------------------------------------------------- ; TODO(Linfeng): Detect cpu type and choose the code with better performance. %define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 -%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON +%if VPX_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON %define NUM_GENERAL_REG_USED 9 %else %define NUM_GENERAL_REG_USED 6 @@ -359,9 +359,9 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ dec heightd -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON +%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define src1q r7 %define sstride6q r8 %define dst_stride dstrideq @@ -467,7 +467,7 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ movx [dstq], m0 %else - ; ARCH_X86_64 + ; VPX_ARCH_X86_64 movx m0, [srcq ] ;A movx m1, [srcq + sstrideq ] ;B @@ -567,7 +567,7 @@ cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ %endif movx [dstq], m0 -%endif ; ARCH_X86_64 +%endif ; VPX_ARCH_X86_64 .done: REP_RET @@ -581,9 +581,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ mova m4, [filterq] SETUP_LOCAL_VARS -%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON +%if VPX_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON -%if ARCH_X86_64 +%if VPX_ARCH_X86_64 %define src1q r7 %define sstride6q r8 %define dst_stride dstrideq @@ -654,7 +654,7 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ REP_RET %else - ; ARCH_X86_64 + ; VPX_ARCH_X86_64 dec heightd movu m1, [srcq ] ;A @@ -790,14 +790,14 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ .done: REP_RET -%endif ; ARCH_X86_64 +%endif ; VPX_ARCH_X86_64 %endm INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index a378dd0402..65790b1c21 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,7 +131,9 @@ dec rcx %endm -global sym(vpx_filter_block1d4_v2_sse2) PRIVATE +SECTION .text + +globalsym(vpx_filter_block1d4_v2_sse2) sym(vpx_filter_block1d4_v2_sse2): push rbp mov rbp, rsp @@ -155,7 +157,7 @@ sym(vpx_filter_block1d4_v2_sse2): pop rbp ret -global sym(vpx_filter_block1d8_v2_sse2) PRIVATE +globalsym(vpx_filter_block1d8_v2_sse2) sym(vpx_filter_block1d8_v2_sse2): push rbp mov rbp, rsp @@ -181,7 +183,7 @@ sym(vpx_filter_block1d8_v2_sse2): pop rbp ret -global sym(vpx_filter_block1d16_v2_sse2) PRIVATE +globalsym(vpx_filter_block1d16_v2_sse2) sym(vpx_filter_block1d16_v2_sse2): push rbp mov rbp, rsp @@ -209,7 +211,7 @@ sym(vpx_filter_block1d16_v2_sse2): pop rbp ret -global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d4_v2_avg_sse2) sym(vpx_filter_block1d4_v2_avg_sse2): push rbp mov rbp, rsp @@ -233,7 +235,7 @@ sym(vpx_filter_block1d4_v2_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d8_v2_avg_sse2) sym(vpx_filter_block1d8_v2_avg_sse2): push rbp mov rbp, rsp @@ -259,7 +261,7 @@ sym(vpx_filter_block1d8_v2_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d16_v2_avg_sse2) sym(vpx_filter_block1d16_v2_avg_sse2): push rbp mov rbp, rsp @@ -287,7 +289,7 @@ sym(vpx_filter_block1d16_v2_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d4_h2_sse2) PRIVATE +globalsym(vpx_filter_block1d4_h2_sse2) sym(vpx_filter_block1d4_h2_sse2): push rbp mov rbp, rsp @@ -312,7 +314,7 @@ sym(vpx_filter_block1d4_h2_sse2): pop rbp ret -global sym(vpx_filter_block1d8_h2_sse2) PRIVATE +globalsym(vpx_filter_block1d8_h2_sse2) sym(vpx_filter_block1d8_h2_sse2): push rbp mov rbp, rsp @@ -339,7 +341,7 @@ sym(vpx_filter_block1d8_h2_sse2): pop rbp ret -global sym(vpx_filter_block1d16_h2_sse2) PRIVATE +globalsym(vpx_filter_block1d16_h2_sse2) sym(vpx_filter_block1d16_h2_sse2): push rbp mov rbp, rsp @@ -367,7 +369,7 @@ sym(vpx_filter_block1d16_h2_sse2): pop rbp ret -global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d4_h2_avg_sse2) sym(vpx_filter_block1d4_h2_avg_sse2): push rbp mov rbp, rsp @@ -392,7 +394,7 @@ sym(vpx_filter_block1d4_h2_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d8_h2_avg_sse2) sym(vpx_filter_block1d8_h2_avg_sse2): push rbp mov rbp, rsp @@ -419,7 +421,7 @@ sym(vpx_filter_block1d8_h2_avg_sse2): pop rbp ret -global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE +globalsym(vpx_filter_block1d16_h2_avg_sse2) sym(vpx_filter_block1d16_h2_avg_sse2): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index 538b2129db..32e3cd3d9f 100644 --- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -105,7 +105,9 @@ dec rcx %endm -global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE +SECTION .text + +globalsym(vpx_filter_block1d4_v2_ssse3) sym(vpx_filter_block1d4_v2_ssse3): push rbp mov rbp, rsp @@ -129,7 +131,7 @@ sym(vpx_filter_block1d4_v2_ssse3): pop rbp ret -global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE +globalsym(vpx_filter_block1d8_v2_ssse3) sym(vpx_filter_block1d8_v2_ssse3): push rbp mov rbp, rsp @@ -155,7 +157,7 @@ sym(vpx_filter_block1d8_v2_ssse3): pop rbp ret -global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE +globalsym(vpx_filter_block1d16_v2_ssse3) sym(vpx_filter_block1d16_v2_ssse3): push rbp mov rbp, rsp @@ -182,7 +184,7 @@ sym(vpx_filter_block1d16_v2_ssse3): pop rbp ret -global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d4_v2_avg_ssse3) sym(vpx_filter_block1d4_v2_avg_ssse3): push rbp mov rbp, rsp @@ -206,7 +208,7 @@ sym(vpx_filter_block1d4_v2_avg_ssse3): pop rbp ret -global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d8_v2_avg_ssse3) sym(vpx_filter_block1d8_v2_avg_ssse3): push rbp mov rbp, rsp @@ -232,7 +234,7 @@ sym(vpx_filter_block1d8_v2_avg_ssse3): pop rbp ret -global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d16_v2_avg_ssse3) sym(vpx_filter_block1d16_v2_avg_ssse3): push rbp mov rbp, rsp @@ -259,7 +261,7 @@ sym(vpx_filter_block1d16_v2_avg_ssse3): pop rbp ret -global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE +globalsym(vpx_filter_block1d4_h2_ssse3) sym(vpx_filter_block1d4_h2_ssse3): push rbp mov rbp, rsp @@ -284,7 +286,7 @@ sym(vpx_filter_block1d4_h2_ssse3): pop rbp ret -global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE +globalsym(vpx_filter_block1d8_h2_ssse3) sym(vpx_filter_block1d8_h2_ssse3): push rbp mov rbp, rsp @@ -311,7 +313,7 @@ sym(vpx_filter_block1d8_h2_ssse3): pop rbp ret -global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE +globalsym(vpx_filter_block1d16_h2_ssse3) sym(vpx_filter_block1d16_h2_ssse3): push rbp mov rbp, rsp @@ -338,7 +340,7 @@ sym(vpx_filter_block1d16_h2_ssse3): pop rbp ret -global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d4_h2_avg_ssse3) sym(vpx_filter_block1d4_h2_avg_ssse3): push rbp mov rbp, rsp @@ -363,7 +365,7 @@ sym(vpx_filter_block1d4_h2_avg_ssse3): pop rbp ret -global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d8_h2_avg_ssse3) sym(vpx_filter_block1d8_h2_avg_ssse3): push rbp mov rbp, rsp @@ -390,7 +392,7 @@ sym(vpx_filter_block1d8_h2_avg_ssse3): pop rbp ret -global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE +globalsym(vpx_filter_block1d16_h2_avg_ssse3) sym(vpx_filter_block1d16_h2_avg_ssse3): push rbp mov rbp, rsp diff --git a/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h index 2c259d322e..fb75d0c808 100644 --- a/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h +++ b/media/libvpx/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ -#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#ifndef VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ #include "./vpx_config.h" #define ADDRESS_STORAGE_SIZE sizeof(size_t) @@ -26,6 +26,6 @@ /*returns an addr aligned to the byte boundary specified by align*/ #define align_addr(addr, align) \ - (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1)) + (void *)(((size_t)(addr) + ((align) - 1)) & ~(size_t)((align) - 1)) -#endif // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#endif // VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.c b/media/libvpx/libvpx/vpx_mem/vpx_mem.c index a9be086806..18abf1158b 100644 --- a/media/libvpx/libvpx/vpx_mem/vpx_mem.c +++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.c @@ -16,12 +16,14 @@ #include "include/vpx_mem_intrnl.h" #include "vpx/vpx_integer.h" +#if !defined(VPX_MAX_ALLOCABLE_MEMORY) #if SIZE_MAX > (1ULL << 40) #define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40) #else // For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. #define VPX_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) #endif +#endif // Returns 0 in case of overflow of nmemb * size. static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) { @@ -82,12 +84,3 @@ void vpx_free(void *memblk) { free(addr); } } - -#if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length) { - size_t i; - uint16_t *dest16 = (uint16_t *)dest; - for (i = 0; i < length; i++) *dest16++ = val; - return dest; -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/vpx_mem/vpx_mem.h b/media/libvpx/libvpx/vpx_mem/vpx_mem.h index 733aff4885..7689a05e6e 100644 --- a/media/libvpx/libvpx/vpx_mem/vpx_mem.h +++ b/media/libvpx/libvpx/vpx_mem/vpx_mem.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_VPX_MEM_H_ -#define VPX_MEM_VPX_MEM_H_ +#ifndef VPX_VPX_MEM_VPX_MEM_H_ +#define VPX_VPX_MEM_VPX_MEM_H_ #include "vpx_config.h" #if defined(__uClinux__) @@ -19,6 +19,8 @@ #include #include +#include "vpx/vpx_integer.h" + #if defined(__cplusplus) extern "C" { #endif @@ -29,7 +31,12 @@ void *vpx_calloc(size_t num, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length); +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} #endif #include @@ -42,4 +49,4 @@ void *vpx_memset16(void *dest, int val, size_t length); } #endif -#endif // VPX_MEM_VPX_MEM_H_ +#endif // VPX_VPX_MEM_VPX_MEM_H_ diff --git a/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 0000000000..20f688e179 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(VPX_USE_ANDROID_CPU_FEATURES) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap & VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c new file mode 100644 index 0000000000..df8e1e244d --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_ports/arm.h" +#include "vpx_ports/arm_cpudetect.h" + +#if defined(__APPLE__) +#include +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_WIN32) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM +// Support for PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 26100. +#if defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE) + // There's no PF_* flag that indicates whether plain I8MM is available + // or not. But if SVE_I8MM is available, that also implies that + // regular I8MM is available. + if (IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_I8MM; + } +#endif // defined(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_I8MM +#if HAVE_SVE +// Support for PF_ARM_SVE_INSTRUCTIONS_AVAILABLE was added in Windows SDK 26100. +#if defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_SVE; + } +#endif // defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_SVE +#if HAVE_SVE2 +// Support for PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 26100. +#if defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_SVE2; + } +#endif // defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_SVE2 + return flags; +} + +#elif defined(VPX_USE_ANDROID_CPU_FEATURES) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_SVE2 (1 << 1) +#define VPX_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON_DOTPROD || HAVE_SVE + unsigned long hwcap = getauxval(AT_HWCAP); +#endif // HAVE_NEON_DOTPROD || HAVE_SVE +#if HAVE_NEON_I8MM || HAVE_SVE2 + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif // HAVE_NEON_I8MM || HAVE_SVE2 +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & VPX_AARCH64_HWCAP_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE +#if HAVE_SVE2 + if (hwcap2 & VPX_AARCH64_HWCAP2_SVE2) { + flags |= HAS_SVE2; + } +#endif // HAVE_SVE2 + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include +#include + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. +#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_NEON_I8MM; + } + + // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_SVE; + } + if (!(flags & HAS_NEON_I8MM)) { + flags &= ~HAS_SVE; + } + + // Restrict flags: FEAT_SVE2 assumes that FEAT_SVE is available. + if (!(flags & HAS_SVE)) { + flags &= ~HAS_SVE2; + } + + return flags; +} diff --git a/media/libvpx/libvpx/vpx_ports/arm.h b/media/libvpx/libvpx/vpx_ports/arm.h index 7be6104a4f..814c3cc408 100644 --- a/media/libvpx/libvpx/vpx_ports/arm.h +++ b/media/libvpx/libvpx/vpx_ports/arm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_ARM_H_ -#define VPX_PORTS_ARM_H_ +#ifndef VPX_VPX_PORTS_ARM_H_ +#define VPX_VPX_PORTS_ARM_H_ #include #include "vpx_config.h" @@ -17,12 +17,16 @@ extern "C" { #endif -/*ARMv5TE "Enhanced DSP" instructions.*/ -#define HAS_EDSP 0x01 -/*ARMv6 "Parallel" or "Media" instructions.*/ -#define HAS_MEDIA 0x02 -/*ARMv7 optional NEON instructions.*/ -#define HAS_NEON 0x04 +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. +#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) +// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. +#define HAS_SVE (1 << 3) +// Armv9.0-A SVE2 instructions. +#define HAS_SVE2 (1 << 4) int arm_cpu_caps(void); @@ -36,4 +40,4 @@ int arm_cpu_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_ARM_H_ +#endif // VPX_VPX_PORTS_ARM_H_ diff --git a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c deleted file mode 100644 index 4f9d480ade..0000000000 --- a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include -#include - -#include "./vpx_config.h" -#include "vpx_ports/arm.h" - -#ifdef WINAPI_FAMILY -#include -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define getenv(x) NULL -#endif -#endif - -static int arm_cpu_env_flags(int *flags) { - char *env; - env = getenv("VPX_SIMD_CAPS"); - if (env && *env) { - *flags = (int)strtol(env, NULL, 0); - return 0; - } - *flags = 0; - return -1; -} - -static int arm_cpu_env_mask(void) { - char *env; - env = getenv("VPX_SIMD_CAPS_MASK"); - return env && *env ? (int)strtol(env, NULL, 0) : ~0; -} - -#if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { - /* This function should actually be a no-op. There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#ifndef WIN32_EXTRA_LEAN -#define WIN32_EXTRA_LEAN -#endif -#include - -int arm_cpu_caps(void) { - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -/* MSVC has no inline __asm support for ARM, but it does let you __emit - * instructions via their assembled hex code. - * All of these instructions should be essentially nops. - */ -#if HAVE_NEON || HAVE_NEON_ASM - if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__ANDROID__) /* end _MSC_VER */ -#include - -int arm_cpu_caps(void) { - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); - -#if HAVE_NEON || HAVE_NEON_ASM - if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__linux__) /* end __ANDROID__ */ - -#include - -int arm_cpu_caps(void) { - FILE *fin; - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - /* Reading /proc/self/auxv would be easier, but that doesn't work reliably - * on Android. - * This also means that detection will fail in Scratchbox. - */ - fin = fopen("/proc/cpuinfo", "r"); - if (fin != NULL) { - /* 512 should be enough for anybody (it's even enough for all the flags - * that x86 has accumulated... so far). - */ - char buf[512]; - while (fgets(buf, 511, fin) != NULL) { -#if HAVE_NEON || HAVE_NEON_ASM - if (memcmp(buf, "Features", 8) == 0) { - char *p; - p = strstr(buf, " neon"); - if (p != NULL && (p[5] == ' ' || p[5] == '\n')) { - flags |= HAS_NEON; - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - } - fclose(fin); - } - return flags & mask; -} -#else /* end __linux__ */ -#error \ - "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." -#endif diff --git a/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h new file mode 100644 index 0000000000..9b64a1fa2d --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_WIN32) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include +#endif + +#ifdef WINAPI_FAMILY +#include +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define VPX_USE_ANDROID_CPU_FEATURES 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 0000000000..400a51cc32 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "ssrld " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "ssrlw " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif // VPX_VPX_PORTS_ASMDEFS_MMI_H_ diff --git a/media/libvpx/libvpx/vpx_ports/bitops.h b/media/libvpx/libvpx/vpx_ports/bitops.h index 0ed7189ff6..e92c972f50 100644 --- a/media/libvpx/libvpx/vpx_ports/bitops.h +++ b/media/libvpx/libvpx/vpx_ports/bitops.h @@ -8,13 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_BITOPS_H_ -#define VPX_PORTS_BITOPS_H_ +#ifndef VPX_VPX_PORTS_BITOPS_H_ +#define VPX_VPX_PORTS_BITOPS_H_ #include -#include "vpx_ports/msvc.h" - #ifdef _MSC_VER #if defined(_M_X64) || defined(_M_IX86) #include @@ -26,20 +24,32 @@ extern "C" { #endif -// These versions of get_msb() are only valid when n != 0 because all -// of the optimized versions are undefined when n == 0: +// These versions of get_lsb() and get_msb() are only valid when n != 0 +// because all of the optimized versions are undefined when n == 0: // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html // use GNU builtins where available. #if defined(__GNUC__) && \ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_lsb(unsigned int n) { + assert(n != 0); + return __builtin_ctz(n); +} + static INLINE int get_msb(unsigned int n) { assert(n != 0); return 31 ^ __builtin_clz(n); } #elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanForward) #pragma intrinsic(_BitScanReverse) +static INLINE int get_lsb(unsigned int n) { + unsigned long first_set_bit; // NOLINT(runtime/int) + _BitScanForward(&first_set_bit, n); + return first_set_bit; +} + static INLINE int get_msb(unsigned int n) { unsigned long first_set_bit; assert(n != 0); @@ -48,6 +58,13 @@ static INLINE int get_msb(unsigned int n) { } #undef USE_MSC_INTRINSICS #else +static INLINE int get_lsb(unsigned int n) { + int i; + assert(n != 0); + for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1; + return i; +} + // Returns (int)floor(log2(n)). n must be > 0. static INLINE int get_msb(unsigned int n) { int log = 0; @@ -72,4 +89,4 @@ static INLINE int get_msb(unsigned int n) { } // extern "C" #endif -#endif // VPX_PORTS_BITOPS_H_ +#endif // VPX_VPX_PORTS_BITOPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/compiler_attributes.h b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h new file mode 100644 index 0000000000..4b468749b8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/compiler_attributes.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ +#define VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ + +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif // !defined(__has_feature) + +#if !defined(__has_attribute) +#define __has_attribute(x) 0 +#endif // !defined(__has_attribute) + +//------------------------------------------------------------------------------ +// Sanitizer attributes. + +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define VPX_WITH_ASAN 1 +#else +#define VPX_WITH_ASAN 0 +#endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) + +#if defined(__clang__) && __has_attribute(no_sanitize) +// Both of these have defined behavior and are used in certain operations or +// optimizations thereof. There are cases where an overflow may be unintended, +// however, so use of these attributes should be done with care. +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#if __clang_major__ >= 12 +#define VPX_NO_UNSIGNED_SHIFT_CHECK \ + __attribute__((no_sanitize("unsigned-shift-base"))) +#endif // __clang__ >= 12 +#endif // __clang__ + +#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK +#define VPX_NO_UNSIGNED_OVERFLOW_CHECK +#endif +#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK +#define VPX_NO_UNSIGNED_SHIFT_CHECK +#endif + +//------------------------------------------------------------------------------ +// Variable attributes. + +#if __has_attribute(uninitialized) +// Attribute "uninitialized" disables -ftrivial-auto-var-init=pattern for +// the specified variable. +// +// -ftrivial-auto-var-init is security risk mitigation feature, so attribute +// should not be used "just in case", but only to fix real performance +// bottlenecks when other approaches do not work. In general the compiler is +// quite effective at eliminating unneeded initializations introduced by the +// flag, e.g. when they are followed by actual initialization by a program. +// However if compiler optimization fails and code refactoring is hard, the +// attribute can be used as a workaround. +#define VPX_UNINITIALIZED __attribute__((uninitialized)) +#else +#define VPX_UNINITIALIZED +#endif // __has_attribute(uninitialized) + +#endif // VPX_VPX_PORTS_COMPILER_ATTRIBUTES_H_ diff --git a/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h index 903534e0c0..d6cc68ee4d 100644 --- a/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h +++ b/media/libvpx/libvpx/vpx_ports/emmintrin_compat.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_ -#define VPX_PORTS_EMMINTRIN_COMPAT_H_ +#ifndef VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ +#define VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ #if defined(__GNUC__) && __GNUC__ < 4 /* From emmintrin.h (gcc 4.5.3) */ @@ -52,4 +52,4 @@ extern __inline __m128d } #endif -#endif // VPX_PORTS_EMMINTRIN_COMPAT_H_ +#endif // VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/media/libvpx/libvpx/vpx_ports/emms_mmx.asm b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm new file mode 100644 index 0000000000..b31b25ebde --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.asm @@ -0,0 +1,18 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text +globalsym(vpx_clear_system_state) +sym(vpx_clear_system_state): + emms + ret diff --git a/media/libvpx/libvpx/vpx_ports/config.h b/media/libvpx/libvpx/vpx_ports/emms_mmx.c similarity index 66% rename from media/libvpx/libvpx/vpx_ports/config.h rename to media/libvpx/libvpx/vpx_ports/emms_mmx.c index 3c1ab99f4a..79b98a75f1 100644 --- a/media/libvpx/libvpx/vpx_ports/config.h +++ b/media/libvpx/libvpx/vpx_ports/emms_mmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,9 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_CONFIG_H_ -#define VPX_PORTS_CONFIG_H_ +#include -#include "vpx_config.h" +#include "vpx_ports/system_state.h" -#endif // VPX_PORTS_CONFIG_H_ +void vpx_clear_system_state(void) { _mm_empty(); } diff --git a/media/libvpx/libvpx/vpx_ports/emms.asm b/media/libvpx/libvpx/vpx_ports/float_control_word.asm similarity index 81% rename from media/libvpx/libvpx/vpx_ports/emms.asm rename to media/libvpx/libvpx/vpx_ports/float_control_word.asm index db8da28737..bb75b7a31f 100644 --- a/media/libvpx/libvpx/vpx_ports/emms.asm +++ b/media/libvpx/libvpx/vpx_ports/float_control_word.asm @@ -12,14 +12,9 @@ %include "vpx_ports/x86_abi_support.asm" section .text -global sym(vpx_reset_mmx_state) PRIVATE -sym(vpx_reset_mmx_state): - emms - ret - %if LIBVPX_YASM_WIN64 -global sym(vpx_winx64_fldcw) PRIVATE +globalsym(vpx_winx64_fldcw) sym(vpx_winx64_fldcw): sub rsp, 8 mov [rsp], rcx ; win x64 specific @@ -28,7 +23,7 @@ sym(vpx_winx64_fldcw): ret -global sym(vpx_winx64_fstcw) PRIVATE +globalsym(vpx_winx64_fstcw) sym(vpx_winx64_fstcw): sub rsp, 8 fstcw [rsp] diff --git a/media/libvpx/libvpx/vpx_ports/loongarch.h b/media/libvpx/libvpx/vpx_ports/loongarch.h new file mode 100644 index 0000000000..d93ff9f5f0 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/loongarch.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_LOONGARCH_H_ +#define VPX_VPX_PORTS_LOONGARCH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_LSX 0x01 +#define HAS_LASX 0x02 + +int loongarch_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_LOONGARCH_H_ diff --git a/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c new file mode 100644 index 0000000000..7b4322d35e --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/loongarch_cpudetect.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * Contributed by Jin Bo + * Contributed by Lu Wang + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "vpx_ports/loongarch.h" + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG2_LSX (1 << 6) +#define LOONGARCH_CFG2_LASX (1 << 7) + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__loongarch__) && defined(__linux__) +int loongarch_cpu_caps(void) { + int reg = 0; + int flag = 0; + + __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2)); + if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX; + + if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX; + + return flag; +} +#else /* end __loongarch__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int loongarch_cpu_caps(void) { return 0; } +#endif diff --git a/media/libvpx/libvpx/vpx_ports/mem.h b/media/libvpx/libvpx/vpx_ports/mem.h index 2d49b7a06d..5165b676ac 100644 --- a/media/libvpx/libvpx/vpx_ports/mem.h +++ b/media/libvpx/libvpx/vpx_ports/mem.h @@ -8,13 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_H_ -#define VPX_PORTS_MEM_H_ +#ifndef VPX_VPX_PORTS_MEM_H_ +#define VPX_VPX_PORTS_MEM_H_ #include "vpx_config.h" #include "vpx/vpx_integer.h" -#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#if defined(__GNUC__) || defined(__SUNPRO_C) #define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) #elif defined(_MSC_VER) #define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val @@ -23,40 +23,28 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -/* Indicates that the usage of the specified variable has been audited to assure - * that it's safe to use uninitialized. Silences 'may be used uninitialized' - * warnings on gcc. - */ -#if defined(__GNUC__) && __GNUC__ -#define UNINITIALIZED_IS_SAFE(x) x = x +#if defined(__has_builtin) +#define VPX_HAS_BUILTIN(x) __has_builtin(x) #else -#define UNINITIALIZED_IS_SAFE(x) x +#define VPX_HAS_BUILTIN(x) 0 #endif -#if HAVE_NEON && defined(_MSC_VER) +#if !VPX_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif /* Shift down with rounding */ -#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n)) -#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n)-1))) >> (n)) +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) +#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n) - 1))) >> (n)) #define ALIGN_POWER_OF_TWO(value, n) \ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CAST_TO_SHORTPTR(x) ((uint16_t *)((uintptr_t)(x))) #if CONFIG_VP9_HIGHBITDEPTH #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +#define CAST_TO_BYTEPTR(x) ((uint8_t *)((uintptr_t)(x))) #endif // CONFIG_VP9_HIGHBITDEPTH -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif // !defined(__has_feature) - -#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) -#define VPX_WITH_ASAN 1 -#else -#define VPX_WITH_ASAN 0 -#endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) - -#endif // VPX_PORTS_MEM_H_ +#endif // VPX_VPX_PORTS_MEM_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops.h b/media/libvpx/libvpx/vpx_ports/mem_ops.h index 343f27577c..b17015e7ec 100644 --- a/media/libvpx/libvpx/vpx_ports/mem_ops.h +++ b/media/libvpx/libvpx/vpx_ports/mem_ops.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_OPS_H_ -#define VPX_PORTS_MEM_OPS_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_H_ +#define VPX_VPX_PORTS_MEM_OPS_H_ /* \file * \brief Provides portable memory access primitives @@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ - -#endif // VPX_PORTS_MEM_OPS_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h index ccac391ba0..8649b87623 100644 --- a/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h +++ b/media/libvpx/libvpx/vpx_ports/mem_ops_aligned.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_ -#define VPX_PORTS_MEM_OPS_ALIGNED_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ #include "vpx/vpx_integer.h" @@ -168,4 +168,4 @@ mem_put_le_aligned_generic(32) #undef swap_endian_32_se /* clang-format on */ -#endif // VPX_PORTS_MEM_OPS_ALIGNED_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mips.h b/media/libvpx/libvpx/vpx_ports/mips.h new file mode 100644 index 0000000000..439de754fd --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mips.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_MIPS_H_ +#define VPX_VPX_PORTS_MIPS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_MMI 0x01 +#define HAS_MSA 0x02 + +int mips_cpu_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_MIPS_H_ diff --git a/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c new file mode 100644 index 0000000000..e0eca2d48d --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/mips_cpudetect.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include +#include +#include "./vpx_config.h" +#include "vpx_ports/mips.h" + +#if CONFIG_RUNTIME_CPU_DETECT +#if defined(__mips__) && defined(__linux__) +int mips_cpu_caps(void) { + char cpuinfo_line[512]; + int flag = 0x0; + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) { + // Assume nothing if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return 0; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { + // Workaround early kernel without mmi in ASEs line. + if (strstr(cpuinfo_line, "Loongson-3")) { + flag |= HAS_MMI; + } else if (strstr(cpuinfo_line, "Loongson-2K")) { + flag |= HAS_MMI | HAS_MSA; + } + } + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + if (strstr(cpuinfo_line, "loongson-mmi") && + strstr(cpuinfo_line, "loongson-ext")) { + flag |= HAS_MMI; + } + if (strstr(cpuinfo_line, "msa")) { + flag |= HAS_MSA; + } + // ASEs is the last line, so we can break here. + break; + } + } + fclose(f); + return flag; +} +#else /* end __mips__ && __linux__ */ +#error \ + "--enable-runtime-cpu-detect selected, but no CPU detection method " \ +"available for your platform. Reconfigure with --disable-runtime-cpu-detect." +#endif +#else /* end CONFIG_RUNTIME_CPU_DETECT */ +int mips_cpu_caps(void) { return 0; } +#endif diff --git a/media/libvpx/libvpx/vpx_ports/msvc.h b/media/libvpx/libvpx/vpx_ports/msvc.h deleted file mode 100644 index 3ff71474b3..0000000000 --- a/media/libvpx/libvpx/vpx_ports/msvc.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_PORTS_MSVC_H_ -#define VPX_PORTS_MSVC_H_ -#ifdef _MSC_VER - -#include "./vpx_config.h" - -#if _MSC_VER < 1900 // VS2015 provides snprintf -#define snprintf _snprintf -#endif // _MSC_VER < 1900 - -#if _MSC_VER < 1800 // VS2013 provides round -#include -static INLINE double round(double x) { - if (x < 0) - return ceil(x - 0.5); - else - return floor(x + 0.5); -} -#endif // _MSC_VER < 1800 - -#endif // _MSC_VER -#endif // VPX_PORTS_MSVC_H_ diff --git a/media/libvpx/libvpx/vpx_ports/ppc.h b/media/libvpx/libvpx/vpx_ports/ppc.h new file mode 100644 index 0000000000..a11f4e8732 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/ppc.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_PPC_H_ +#define VPX_VPX_PORTS_PPC_H_ +#include + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_PPC_H_ diff --git a/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c new file mode 100644 index 0000000000..374a0271c9 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/ppc_cpudetect.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include +#include + +#include "./vpx_config.h" +#include "vpx_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If VPX_SIMD_CAPS is set then allow only those capabilities. + if (!cpu_env_flags(&flags)) { + return flags; + } + + mask = cpu_env_mask(); + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return 0; + } + + while ((count = read(fd, buf, sizeof(buf))) > 0) { + for (i = 0; i < (count / sizeof(*buf)); i += 2) { + if (buf[i] == AT_HWCAP) { +#if HAVE_VSX + if (buf[i + 1] & PPC_FEATURE_HAS_VSX) { + flags |= HAS_VSX; + } +#endif // HAVE_VSX + goto out_close; + } else if (buf[i] == AT_NULL) { + goto out_close; + } + } + } +out_close: + close(fd); + return flags & mask; +} +#else +// If there is no RTCD the function pointers are not used and can not be +// changed. +int ppc_simd_caps(void) { return 0; } +#endif // CONFIG_RUNTIME_CPU_DETECT diff --git a/media/libvpx/libvpx/vpx_ports/static_assert.h b/media/libvpx/libvpx/vpx_ports/static_assert.h new file mode 100644 index 0000000000..f632d9f1e8 --- /dev/null +++ b/media/libvpx/libvpx/vpx_ports/static_assert.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_PORTS_STATIC_ASSERT_H_ +#define VPX_VPX_PORTS_STATIC_ASSERT_H_ + +#if defined(_MSC_VER) +#define VPX_STATIC_ASSERT(boolexp) \ + do { \ + char vpx_static_assert[(boolexp) ? 1 : -1]; \ + (void)vpx_static_assert; \ + } while (0) +#else // !_MSC_VER +#define VPX_STATIC_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int vpx_static_assert : (boolexp) ? 1 : -1; \ + } vpx_static_assert; \ + (void)vpx_static_assert; \ + } while (0) +#endif // _MSC_VER + +#endif // VPX_VPX_PORTS_STATIC_ASSERT_H_ diff --git a/media/libvpx/libvpx/vpx_ports/system_state.h b/media/libvpx/libvpx/vpx_ports/system_state.h index 086c64681f..32ebd0ed8c 100644 --- a/media/libvpx/libvpx/vpx_ports/system_state.h +++ b/media/libvpx/libvpx/vpx_ports/system_state.h @@ -8,15 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_SYSTEM_STATE_H_ -#define VPX_PORTS_SYSTEM_STATE_H_ +#ifndef VPX_VPX_PORTS_SYSTEM_STATE_H_ +#define VPX_VPX_PORTS_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -void vpx_reset_mmx_state(void); -#define vpx_clear_system_state() vpx_reset_mmx_state() +#ifdef __cplusplus +extern "C" { +#endif + +#if (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX +extern void vpx_clear_system_state(void); #else #define vpx_clear_system_state() -#endif // ARCH_X86 || ARCH_X86_64 -#endif // VPX_PORTS_SYSTEM_STATE_H_ +#endif // (VPX_ARCH_X86 || VPX_ARCH_X86_64) && HAVE_MMX + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_SYSTEM_STATE_H_ diff --git a/media/libvpx/libvpx/vpx_ports/vpx_once.h b/media/libvpx/libvpx/vpx_ports/vpx_once.h index 7d9fc3b406..d33eff4397 100644 --- a/media/libvpx/libvpx/vpx_ports/vpx_once.h +++ b/media/libvpx/libvpx/vpx_ports/vpx_once.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_ONCE_H_ -#define VPX_PORTS_VPX_ONCE_H_ +#ifndef VPX_VPX_PORTS_VPX_ONCE_H_ +#define VPX_VPX_PORTS_VPX_ONCE_H_ #include "vpx_config.h" @@ -91,29 +91,6 @@ static void once(void (*func)(void)) { return; } -#elif CONFIG_MULTITHREAD && defined(__OS2__) -#define INCL_DOS -#include -static void once(void (*func)(void)) { - static int done; - - /* If the initialization is complete, return early. */ - if (done) return; - - /* Causes all other threads in the process to block themselves - * and give up their time slice. - */ - DosEnterCritSec(); - - if (!done) { - func(); - done = 1; - } - - /* Restores normal thread dispatching for the current process. */ - DosExitCritSec(); -} - #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include static void once(void (*func)(void)) { @@ -128,7 +105,7 @@ static void once(void (*func)(void)) { */ static void once(void (*func)(void)) { - static int done; + static volatile int done; if (!done) { func(); @@ -137,4 +114,4 @@ static void once(void (*func)(void)) { } #endif -#endif // VPX_PORTS_VPX_ONCE_H_ +#endif // VPX_VPX_PORTS_VPX_ONCE_H_ diff --git a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk index 36b14936df..b0ad210f60 100644 --- a/media/libvpx/libvpx/vpx_ports/vpx_ports.mk +++ b/media/libvpx/libvpx/vpx_ports/vpx_ports.mk @@ -12,16 +12,46 @@ PORTS_SRCS-yes += vpx_ports.mk PORTS_SRCS-yes += bitops.h +PORTS_SRCS-yes += compiler_attributes.h PORTS_SRCS-yes += mem.h -PORTS_SRCS-yes += msvc.h +PORTS_SRCS-yes += static_assert.h PORTS_SRCS-yes += system_state.h PORTS_SRCS-yes += vpx_timer.h -ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) -PORTS_SRCS-yes += emms.asm +ifeq ($(VPX_ARCH_X86),yes) +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c +endif +ifeq ($(VPX_ARCH_X86_64),yes) +# Visual Studio x64 does not support the _mm_empty() intrinsic. +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm +endif + +ifeq ($(VPX_ARCH_X86_64),yes) +PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm +endif + +ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes) PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif -PORTS_SRCS-$(ARCH_ARM) += arm_cpudetect.c -PORTS_SRCS-$(ARCH_ARM) += arm.h +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h +PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h + +PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_PPC) += ppc.h + +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h + +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c +PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h + +ifeq ($(VPX_ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/media/libvpx/libvpx/vpx_ports/vpx_timer.h b/media/libvpx/libvpx/vpx_ports/vpx_timer.h index c1f1b60275..55e7891e19 100644 --- a/media/libvpx/libvpx/vpx_ports/vpx_timer.h +++ b/media/libvpx/libvpx/vpx_ports/vpx_timer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_TIMER_H_ -#define VPX_PORTS_VPX_TIMER_H_ +#ifndef VPX_VPX_PORTS_VPX_TIMER_H_ +#define VPX_VPX_PORTS_VPX_TIMER_H_ #include "./vpx_config.h" @@ -31,17 +31,17 @@ /* * POSIX specific includes */ -#include +#include /* timersub is not provided by msys at this time. */ -#ifndef timersub -#define timersub(a, b, result) \ +#ifndef timersub_ns +#define timersub_ns(a, b, result) \ do { \ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ - (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ - if ((result)->tv_usec < 0) { \ + (result)->tv_nsec = (a)->tv_nsec - (b)->tv_nsec; \ + if ((result)->tv_nsec < 0) { \ --(result)->tv_sec; \ - (result)->tv_usec += 1000000; \ + (result)->tv_nsec += 1000000000; \ } \ } while (0) #endif @@ -51,23 +51,27 @@ struct vpx_usec_timer { #if defined(_WIN32) LARGE_INTEGER begin, end; #else - struct timeval begin, end; + struct timespec begin, end; #endif }; static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); +#elif defined(CLOCK_MONOTONIC_RAW) + clock_gettime(CLOCK_MONOTONIC_RAW, &t->begin); #else - gettimeofday(&t->begin, NULL); + clock_gettime(CLOCK_MONOTONIC, &t->begin); #endif } static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); +#elif defined(CLOCK_MONOTONIC_RAW) + clock_gettime(CLOCK_MONOTONIC_RAW, &t->end); #else - gettimeofday(&t->end, NULL); + clock_gettime(CLOCK_MONOTONIC, &t->end); #endif } @@ -80,18 +84,18 @@ static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { QueryPerformanceFrequency(&freq); return diff.QuadPart * 1000000 / freq.QuadPart; #else - struct timeval diff; + struct timespec diff; - timersub(&t->end, &t->begin, &diff); - return diff.tv_sec * 1000000 + diff.tv_usec; + timersub_ns(&t->end, &t->begin, &diff); + return (int64_t)diff.tv_sec * 1000000 + diff.tv_nsec / 1000; #endif } #else /* CONFIG_OS_SUPPORT = 0*/ /* Empty timer functions if CONFIG_OS_SUPPORT = 0 */ -#ifndef timersub -#define timersub(a, b, result) +#ifndef timersub_ns +#define timersub_ns(a, b, result) #endif struct vpx_usec_timer { @@ -106,4 +110,4 @@ static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } #endif /* CONFIG_OS_SUPPORT */ -#endif // VPX_PORTS_VPX_TIMER_H_ +#endif // VPX_VPX_PORTS_VPX_TIMER_H_ diff --git a/media/libvpx/libvpx/vpx_ports/x86.h b/media/libvpx/libvpx/vpx_ports/x86.h index 5aabb9e3af..4e450f8b5f 100644 --- a/media/libvpx/libvpx/vpx_ports/x86.h +++ b/media/libvpx/libvpx/vpx_ports/x86.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_X86_H_ -#define VPX_PORTS_X86_H_ +#ifndef VPX_VPX_PORTS_X86_H_ +#define VPX_VPX_PORTS_X86_H_ #include #if defined(_MSC_VER) @@ -42,12 +42,12 @@ typedef enum { VPX_CPU_LAST } vpx_cpu_t; -#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__) -#if ARCH_X86_64 +#if defined(__GNUC__) || defined(__ANDROID__) +#if VPX_ARCH_X86_64 #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__("cpuid \n\t" \ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ __asm__ __volatile__( \ @@ -55,11 +55,11 @@ typedef enum { "cpuid \n\t" \ "xchg %%edi, %%ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #elif defined(__SUNPRO_C) || \ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ "xchg %rsi, %rbx \n\t" \ @@ -67,7 +67,7 @@ typedef enum { "movl %ebx, %edi \n\t" \ "xchg %rsi, %rbx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #else #define cpuid(func, func2, ax, bx, cx, dx) \ asm volatile( \ @@ -76,10 +76,10 @@ typedef enum { "movl %ebx, %edi \n\t" \ "popl %ebx \n\t" \ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \ - : "a"(func), "c"(func2)); + : "a"(func), "c"(func2)) #endif #else /* end __SUNPRO__ */ -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 #if defined(_MSC_VER) && _MSC_VER > 1500 #define cpuid(func, func2, a, b, c, d) \ do { \ @@ -151,78 +151,104 @@ static INLINE uint64_t xgetbv(void) { #endif #endif -#define HAS_MMX 0x01 -#define HAS_SSE 0x02 -#define HAS_SSE2 0x04 -#define HAS_SSE3 0x08 -#define HAS_SSSE3 0x10 -#define HAS_SSE4_1 0x20 -#define HAS_AVX 0x40 -#define HAS_AVX2 0x80 +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1 << n) +#define BIT(n) (1u << (n)) #endif +#define MMX_BITS BIT(23) +#define SSE_BITS BIT(25) +#define SSE2_BITS BIT(26) +#define SSE3_BITS BIT(0) +#define SSSE3_BITS BIT(9) +#define SSE4_1_BITS BIT(19) +// Bits 27 (OSXSAVE) & 28 (256-bit AVX) +#define AVX_BITS (BIT(27) | BIT(28)) +#define AVX2_BITS BIT(5) +// Bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & 30 (AVX-512BW) +// & 31 (AVX-512VL) +#define AVX512_BITS (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31)) + +#define FEATURE_SET(reg, feature) \ + (((reg) & (feature##_BITS)) == (feature##_BITS)) + static INLINE int x86_simd_caps(void) { unsigned int flags = 0; - unsigned int mask = ~0; + unsigned int mask = ~0u; unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx; char *env; (void)reg_ebx; /* See if the CPU capabilities are being overridden by the environment */ env = getenv("VPX_SIMD_CAPS"); - if (env && *env) return (int)strtol(env, NULL, 0); env = getenv("VPX_SIMD_CAPS_MASK"); - if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0); /* Ensure that the CPUID instruction supports extended features */ cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx); - if (max_cpuid_val < 1) return 0; /* Get the standard feature flags */ cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); - if (reg_edx & BIT(23)) flags |= HAS_MMX; + flags |= FEATURE_SET(reg_edx, MMX) ? HAS_MMX : 0; + flags |= FEATURE_SET(reg_edx, SSE) ? HAS_SSE : 0; + flags |= FEATURE_SET(reg_edx, SSE2) ? HAS_SSE2 : 0; + flags |= FEATURE_SET(reg_ecx, SSE3) ? HAS_SSE3 : 0; + flags |= FEATURE_SET(reg_ecx, SSSE3) ? HAS_SSSE3 : 0; + flags |= FEATURE_SET(reg_ecx, SSE4_1) ? HAS_SSE4_1 : 0; - if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */ - - if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */ - - if (reg_ecx & BIT(0)) flags |= HAS_SSE3; - - if (reg_ecx & BIT(9)) flags |= HAS_SSSE3; - - if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1; - - // bits 27 (OSXSAVE) & 28 (256-bit AVX) - if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) { + if (FEATURE_SET(reg_ecx, AVX)) { + // Check for OS-support of YMM state. Necessary for AVX and AVX2. if ((xgetbv() & 0x6) == 0x6) { flags |= HAS_AVX; - if (max_cpuid_val >= 7) { /* Get the leaf 7 feature flags. Needed to check for AVX2 support */ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); - - if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + flags |= FEATURE_SET(reg_ebx, AVX2) ? HAS_AVX2 : 0; + if (FEATURE_SET(reg_ebx, AVX512)) { + // Check for OS-support of ZMM and YMM state. Necessary for AVX-512. + if ((xgetbv() & 0xe6) == 0xe6) flags |= HAS_AVX512; + } } } } - + (void)reg_eax; // Avoid compiler warning on unused-but-set variable. return flags & mask; } -// Note: -// 32-bit CPU cycle counter is light-weighted for most function performance -// measurement. For large function (CPU time > a couple of seconds), 64-bit -// counter should be used. -// 32-bit CPU cycle counter +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel(R) IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. static INLINE unsigned int x86_readtsc(void) { -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) unsigned int tsc; __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); return tsc; @@ -231,7 +257,7 @@ static INLINE unsigned int x86_readtsc(void) { asm volatile("rdtsc\n\t" : "=a"(tsc) :); return tsc; #else -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 return (unsigned int)__rdtsc(); #else __asm rdtsc; @@ -240,7 +266,7 @@ static INLINE unsigned int x86_readtsc(void) { } // 64-bit CPU cycle counter static INLINE uint64_t x86_readtsc64(void) { -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) uint32_t hi, lo; __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; @@ -249,7 +275,7 @@ static INLINE uint64_t x86_readtsc64(void) { asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; #else -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 return (uint64_t)__rdtsc(); #else __asm rdtsc; @@ -257,19 +283,66 @@ static INLINE uint64_t x86_readtsc64(void) { #endif } -#if defined(__GNUC__) && __GNUC__ +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. +static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if VPX_ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + // This call should not be removed. See function notes above. + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + // Avoid compiler warnings on unused-but-set variables. + (void)reg_eax; + (void)reg_ebx; + (void)reg_ecx; + (void)reg_edx; + return v; +} + +#if defined(__GNUC__) #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) #define x86_pause_hint() asm volatile("pause \n\t") #else -#if ARCH_X86_64 +#if VPX_ARCH_X86_64 #define x86_pause_hint() _mm_pause(); #else #define x86_pause_hint() __asm pause #endif #endif -#if defined(__GNUC__) && __GNUC__ +#if defined(__GNUC__) static void x87_set_control_word(unsigned short mode) { __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); } @@ -287,7 +360,7 @@ static unsigned short x87_get_control_word(void) { asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); return mode; } -#elif ARCH_X86_64 +#elif VPX_ARCH_X86_64 /* No fldcw intrinsics on Windows x64, punt to external asm */ extern void vpx_winx64_fldcw(unsigned short mode); extern unsigned short vpx_winx64_fstcw(void); @@ -306,14 +379,23 @@ static unsigned short x87_get_control_word(void) { static INLINE unsigned int x87_set_double_precision(void) { unsigned int mode = x87_get_control_word(); - x87_set_control_word((mode & ~0x300) | 0x200); + // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 + // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf + // 8.1.5.2 Precision Control Field + // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") + // determine the number of bits used in floating point calculations. To match + // later SSE instructions restrict x87 operations to Double Precision (0x200). + // Precision PC Field + // Single Precision (24-Bits) 00B + // Reserved 01B + // Double Precision (53-Bits) 10B + // Extended Precision (64-Bits) 11B + x87_set_control_word((mode & ~0x300u) | 0x200u); return mode; } -extern void vpx_reset_mmx_state(void); - #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_PORTS_X86_H_ +#endif // VPX_VPX_PORTS_X86_H_ diff --git a/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm index 708fa101c5..6b2d6b9684 100644 --- a/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm +++ b/media/libvpx/libvpx/vpx_ports/x86_abi_support.asm @@ -89,49 +89,70 @@ %define LIBVPX_YASM_WIN64 0 %endif +; Declare groups of platforms +%ifidn __OUTPUT_FORMAT__,elf32 + %define LIBVPX_ELF 1 +%elifidn __OUTPUT_FORMAT__,elfx32 + %define LIBVPX_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define LIBVPX_ELF 1 +%else + %define LIBVPX_ELF 0 +%endif + +%ifidn __OUTPUT_FORMAT__,macho32 + %define LIBVPX_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define LIBVPX_MACHO 1 +%else + %define LIBVPX_MACHO 0 +%endif + ; sym() ; Return the proper symbol name for the target ABI. ; ; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols ; with C linkage be prefixed with an underscore. ; -%ifidn __OUTPUT_FORMAT__,elf32 -%define sym(x) x -%elifidn __OUTPUT_FORMAT__,elf64 -%define sym(x) x -%elifidn __OUTPUT_FORMAT__,elfx32 -%define sym(x) x -%elif LIBVPX_YASM_WIN64 -%define sym(x) x +%if LIBVPX_ELF || LIBVPX_YASM_WIN64 + %define sym(x) x %else -%define sym(x) _ %+ x + ; Mach-O / COFF + %define sym(x) _ %+ x %endif -; PRIVATE -; Macro for the attribute to hide a global symbol for the target ABI. -; This is only active if CHROMIUM is defined. +; globalsym() +; Return a global declaration with the proper decoration for the target ABI. ; -; Chromium doesn't like exported global symbols due to symbol clashing with -; plugins among other things. +; When CHROMIUM is defined, include attributes to hide the symbol from the +; global namespace. ; -; Requires Chromium's patched copy of yasm: -; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 -; http://www.tortall.net/projects/yasm/ticket/236 +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. +; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; or nasm > 2.14. ; %ifdef CHROMIUM - %ifidn __OUTPUT_FORMAT__,elf32 - %define PRIVATE :hidden - %elifidn __OUTPUT_FORMAT__,elf64 - %define PRIVATE :hidden - %elifidn __OUTPUT_FORMAT__,elfx32 - %define PRIVATE :hidden - %elif LIBVPX_YASM_WIN64 - %define PRIVATE + %ifdef __NASM_VER__ + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + ; nasm < 2.14 does not support :private_extern directive + %fatal Must use nasm 2.14 or newer + %endif + %endif + + %if LIBVPX_ELF + %define globalsym(x) global sym(x) %+ :function hidden + %elif LIBVPX_MACHO + %define globalsym(x) global sym(x) %+ :private_extern %else - %define PRIVATE :private_extern + ; COFF / PE32+ + %define globalsym(x) global sym(x) %endif %else - %define PRIVATE + %define globalsym(x) global sym(x) %endif ; arg() diff --git a/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c index b554a56e83..d8db4b3547 100644 --- a/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c +++ b/media/libvpx/libvpx/vpx_scale/generic/gen_scalers.c @@ -12,8 +12,8 @@ #include "vpx_scale/vpx_scale.h" #include "vpx_mem/vpx_mem.h" /**************************************************************************** -* Imports -****************************************************************************/ + * Imports + ****************************************************************************/ /**************************************************************************** * diff --git a/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c index 20e1ff90fd..36d6d3628a 100644 --- a/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c +++ b/media/libvpx/libvpx/vpx_scale/generic/vpx_scale.c @@ -17,8 +17,10 @@ ***************************************************************************/ /**************************************************************************** -* Header Files -****************************************************************************/ + * Header Files + ****************************************************************************/ +#include + #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" @@ -172,6 +174,7 @@ static void scale1d_c(const unsigned char *source, int source_step, /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale * );*/ + assert(dest_scale != 0); for (i = 0; i < dest_length * dest_step; i += dest_step) { dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12config.c b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c index a674eac84b..07deeb2016 100644 --- a/media/libvpx/libvpx/vpx_scale/generic/yv12config.c +++ b/media/libvpx/libvpx/vpx_scale/generic/yv12config.c @@ -9,20 +9,25 @@ */ #include +#include +#include #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if defined(VPX_MAX_ALLOCABLE_MEMORY) +#include "vp9/common/vp9_onyxc_int.h" +#endif // VPX_MAX_ALLOCABLE_MEMORY /**************************************************************************** -* Exports -****************************************************************************/ + * Exports + ****************************************************************************/ /**************************************************************************** * ****************************************************************************/ #define yv12_align_addr(addr, align) \ - (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align)) + (void *)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { @@ -53,17 +58,29 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int uv_width = aligned_width >> 1; int uv_height = aligned_height >> 1; /** There is currently a bunch of code which assumes - * uv_stride == y_stride/2, so enforce this here. */ + * uv_stride == y_stride/2, so enforce this here. */ int uv_stride = y_stride >> 1; int uvplane_size = (uv_height + border) * uv_stride; - const int frame_size = yplane_size + 2 * uvplane_size; + const size_t frame_size = yplane_size + 2 * uvplane_size; if (!ybf->buffer_alloc) { ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); + if (!ybf->buffer_alloc) { + ybf->buffer_alloc_sz = 0; + return -1; + } +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, frame_size); +#endif +#endif ybf->buffer_alloc_sz = frame_size; } - if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1; + if (ybf->buffer_alloc_sz < frame_size) return -1; /* Only support allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of @@ -141,6 +158,17 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border, int byte_alignment, vpx_codec_frame_buffer_t *fb, vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; + if (ybf) { const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const int aligned_width = (width + 7) & ~7; @@ -165,6 +193,21 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; +#if defined(VPX_MAX_ALLOCABLE_MEMORY) + // The decoder may allocate REF_FRAMES frame buffers in the frame buffer + // pool. Bound the total amount of allocated memory as if these REF_FRAMES + // frame buffers were allocated in a single allocation. + if (frame_size > VPX_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1; +#endif // VPX_MAX_ALLOCABLE_MEMORY + +#if UINT64_MAX > SIZE_MAX + // frame_size is stored in buffer_alloc_sz, which is a size_t. If it won't + // fit, fail early. + if (frame_size > SIZE_MAX) { + return -1; + } +#endif + if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -185,20 +228,19 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, // This memset is needed for fixing the issue of using uninitialized // value in msan test. It will cause a perf loss, so only do this for // msan test. - memset(ybf->buffer_alloc, 0, (int)frame_size); + memset(ybf->buffer_alloc, 0, (size_t)frame_size); #endif #endif - } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { + } else if (frame_size > ybf->buffer_alloc_sz) { // Allocation to hold larger frame, or first allocation. vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; - - if (frame_size != (size_t)frame_size) return -1; + ybf->buffer_alloc_sz = 0; ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; - ybf->buffer_alloc_sz = (int)frame_size; + ybf->buffer_alloc_sz = (size_t)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be @@ -206,13 +248,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); } - /* Only support allocating buffers that have a border that's a multiple - * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without introducing an arbitrary gap - * between planes, which would break the semantics of things like - * vpx_img_set_rect(). */ - if (border & 0x1f) return -3; - ybf->y_crop_width = width; ybf->y_crop_height = height; ybf->y_width = aligned_width; @@ -226,7 +261,7 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = (int)frame_size; + ybf->frame_size = (size_t)frame_size; ybf->subsampling_x = ss_x; ybf->subsampling_y = ss_y; diff --git a/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c index a6aaff95a0..e231806505 100644 --- a/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c +++ b/media/libvpx/libvpx/vpx_scale/generic/yv12extend.c @@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_VP9_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, - ybf->y_crop_height, ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); - - extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - - extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - return; - } -#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, ybf->border + ybf->y_height - ybf->y_crop_height, @@ -208,12 +189,55 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. + void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + #if 0 /* These assertions are valid in the codec, but the libvpx-tester uses * this code slightly differently. @@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); return; } else { assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); @@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); } +#endif // CONFIG_VP9 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale.h b/media/libvpx/libvpx/vpx_scale/vpx_scale.h index 478a483461..fd5ba7ccdc 100644 --- a/media/libvpx/libvpx/vpx_scale/vpx_scale.h +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_SCALE_VPX_SCALE_H_ -#define VPX_SCALE_VPX_SCALE_H_ +#ifndef VPX_VPX_SCALE_VPX_SCALE_H_ +#define VPX_VPX_SCALE_VPX_SCALE_H_ #include "vpx_scale/yv12config.h" @@ -19,4 +19,4 @@ extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, unsigned int vscale, unsigned int vratio, unsigned int interlaced); -#endif // VPX_SCALE_VPX_SCALE_H_ +#endif // VPX_VPX_SCALE_VPX_SCALE_H_ diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c index dc4d9593a8..706b0770c8 100644 --- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.c @@ -12,4 +12,4 @@ #include "./vpx_scale_rtcd.h" #include "vpx_ports/vpx_once.h" -void vpx_scale_rtcd() { once(setup_rtcd_internal); } +void vpx_scale_rtcd(void) { once(setup_rtcd_internal); } diff --git a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl index 44b115c7eb..1281071a7d 100644 --- a/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl +++ b/media/libvpx/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_scale_forward_decls() { print < #include "./vpx_config.h" @@ -115,4 +115,4 @@ static INLINE uint64_t BSwap64(uint64_t x) { #endif // HAVE_BUILTIN_BSWAP64 } -#endif // VPX_UTIL_ENDIAN_INL_H_ +#endif // VPX_VPX_UTIL_ENDIAN_INL_H_ diff --git a/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h new file mode 100644 index 0000000000..b8b9e6db02 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/loongson_intrinsics.h @@ -0,0 +1,2090 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + */ + +#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ +#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ + +/* + * Copyright (c) 2021 Loongson Technology Corporation Limited + * All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + * + * Contributed by Shiyou Yin + * Xiwei Gu + * Lu Wang + * + * This file is a header file for loongarch builtin extension. + * + */ + +#ifndef LOONGSON_INTRINSICS_H +#define LOONGSON_INTRINSICS_H + +/** + * MAJOR version: Macro usage changes. + * MINOR version: Add new functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ +#define LSOM_VERSION_MAJOR 1 +#define LSOM_VERSION_MINOR 2 +#define LSOM_VERSION_MICRO 1 + +#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0); \ + _OUT1 = _INS(_IN1); \ + } + +#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1); \ + _OUT1 = _INS(_IN2, _IN3); \ + } + +#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \ + { \ + _OUT0 = _INS(_IN0, _IN1, _IN2); \ + _OUT1 = _INS(_IN3, _IN4, _IN5); \ + } + +#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \ + DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \ + } + +#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \ + _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \ + DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \ + } + +#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \ + _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \ + { \ + DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1); \ + DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \ + } + +#ifdef __loongarch_sx +#include +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. Then + * the results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * The results are added to signed half-word elements from in_c. + * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l) + * in_c : 1,1,1,1, 1,1,1,1 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8 + * out : -4,-24,-60,-112, 6,26,62,114 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of half-word vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Outputs - out + * Return Type - __m128i + * Details : Signed half-word elements from in_h are multiplied by + * signed half-word elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Then the results are added to signed word elements from in_c. + * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l) + * in_c : 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 23,40,41,26 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, + __m128i in_l) { + __m128i out; + + out = __lsx_vmaddwev_w_h(in_c, in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_b(in_h, in_l); + out = __lsx_vmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * unsigned byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_bu(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu(in_h, in_l); + out = __lsx_vmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1 + * out : 22,38,38,22, 22,38,38,6 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_h_bu_b(in_h, in_l); + out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_w_h(in_h, in_l); + out = __lsx_vmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Outputs - out + * Return Type - double + * Details : Signed byte elements from in_h are multiplied by + * signed byte elements from in_l, and then added adjacent to + * each other to get a result twice the size of input. + * Example : out = __lsx_vdp2_d_w(in_h, in_l) + * in_h : 1,2,3,4 + * in_l : 8,7,6,5 + * out : 22,38 + * ============================================================================= + */ +static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) { + __m128i out; + + out = __lsx_vmulwev_d_w(in_h, in_l); + out = __lsx_vmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : + * (_in)) + * Arguments : Inputs - _in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lsx_vclip_h(_in) + * _in : -8,2,280,249, -8,255,280,249 + * min : 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9 + * ============================================================================= + */ +static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) { + __m128i out; + + out = __lsx_vmax_h(min, _in); + out = __lsx_vmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - halfword + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_h(_in) + * _in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_h(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_h(_in, 0); + out = __lsx_vsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Set each element of vector between 0 and 255 + * Arguments : Inputs - _in + * Outputs - out + * Return Type - word + * Details : Signed byte elements from _in are clamped between 0 and 255. + * Example : out = __lsx_vclip255_w(_in) + * _in : -8,255,280,249 + * out : 0,255,255,249 + * ============================================================================= + */ +static inline __m128i __lsx_vclip255_w(__m128i _in) { + __m128i out; + + out = __lsx_vmaxi_w(_in, 0); + out = __lsx_vsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Swap two variables + * Arguments : Inputs - _in0, _in1 + * Outputs - _in0, _in1 (in-place) + * Details : Swapping of two input variables using xor + * Example : LSX_SWAP(_in0, _in1) + * _in0 : 1,2,3,4 + * _in1 : 5,6,7,8 + * _in0(out) : 5,6,7,8 + * _in1(out) : 1,2,3,4 + * ============================================================================= + */ +#define LSX_SWAP(_in0, _in1) \ + { \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + _in1 = __lsx_vxor_v(_in0, _in1); \ + _in0 = __lsx_vxor_v(_in0, _in1); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + * ============================================================================= + */ +#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + __m128i _t0, _t1, _t2, _t3; \ + \ + _t0 = __lsx_vilvl_w(_in1, _in0); \ + _t1 = __lsx_vilvh_w(_in1, _in0); \ + _t2 = __lsx_vilvl_w(_in3, _in2); \ + _t3 = __lsx_vilvh_w(_in3, _in2); \ + _out0 = __lsx_vilvl_d(_t2, _t0); \ + _out1 = __lsx_vilvh_d(_t2, _t0); \ + _out2 = __lsx_vilvl_d(_t3, _t1); \ + _out3 = __lsx_vilvh_d(_t3, _t1); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with byte elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Details : The rows of the matrix become columns, and the columns + * become rows. + * Example : LSX_TRANSPOSE8x8_B + * _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00 + * + * _ out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _ out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _ out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _ out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * _ out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00 + * _ out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00 + * _ out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00 + * _ out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i zero = { 0 }; \ + __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _t0 = __lsx_vilvl_b(_in2, _in0); \ + _t1 = __lsx_vilvl_b(_in3, _in1); \ + _t2 = __lsx_vilvl_b(_in6, _in4); \ + _t3 = __lsx_vilvl_b(_in7, _in5); \ + _t4 = __lsx_vilvl_b(_t1, _t0); \ + _t5 = __lsx_vilvh_b(_t1, _t0); \ + _t6 = __lsx_vilvl_b(_t3, _t2); \ + _t7 = __lsx_vilvh_b(_t3, _t2); \ + _out0 = __lsx_vilvl_w(_t6, _t4); \ + _out2 = __lsx_vilvh_w(_t6, _t4); \ + _out4 = __lsx_vilvl_w(_t7, _t5); \ + _out6 = __lsx_vilvh_w(_t7, _t5); \ + _out1 = __lsx_vshuf_b(zero, _out0, shuf8); \ + _out3 = __lsx_vshuf_b(zero, _out2, shuf8); \ + _out5 = __lsx_vshuf_b(zero, _out4, shuf8); \ + _out7 = __lsx_vshuf_b(zero, _out6, shuf8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 00,01,02,03,04,05,06,07 00,10,20,30,40,50,60,70 + * 10,11,12,13,14,15,16,17 01,11,21,31,41,51,61,71 + * 20,21,22,23,24,25,26,27 02,12,22,32,42,52,62,72 + * 30,31,32,33,34,35,36,37 to 03,13,23,33,43,53,63,73 + * 40,41,42,43,44,45,46,47 ======> 04,14,24,34,44,54,64,74 + * 50,51,52,53,54,55,56,57 05,15,25,35,45,55,65,75 + * 60,61,62,63,64,65,66,67 06,16,26,36,46,56,66,76 + * 70,71,72,73,74,75,76,77 07,17,27,37,47,57,67,77 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _s0 = __lsx_vilvl_h(_in6, _in4); \ + _s1 = __lsx_vilvl_h(_in7, _in5); \ + _t0 = __lsx_vilvl_h(_s1, _s0); \ + _t1 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in6, _in4); \ + _s1 = __lsx_vilvh_h(_in7, _in5); \ + _t2 = __lsx_vilvl_h(_s1, _s0); \ + _t3 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvl_h(_in2, _in0); \ + _s1 = __lsx_vilvl_h(_in3, _in1); \ + _t4 = __lsx_vilvl_h(_s1, _s0); \ + _t5 = __lsx_vilvh_h(_s1, _s0); \ + _s0 = __lsx_vilvh_h(_in2, _in0); \ + _s1 = __lsx_vilvh_h(_in3, _in1); \ + _t6 = __lsx_vilvl_h(_s1, _s0); \ + _t7 = __lsx_vilvh_h(_s1, _s0); \ + \ + _out0 = __lsx_vpickev_d(_t0, _t4); \ + _out2 = __lsx_vpickev_d(_t1, _t5); \ + _out4 = __lsx_vpickev_d(_t2, _t6); \ + _out6 = __lsx_vpickev_d(_t3, _t7); \ + _out1 = __lsx_vpickod_d(_t0, _t4); \ + _out3 = __lsx_vpickod_d(_t1, _t5); \ + _out5 = __lsx_vpickod_d(_t2, _t6); \ + _out7 = __lsx_vpickod_d(_t3, _t7); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x4 byte block into 4x8 + * Arguments : Inputs - _in0, _in1, _in2, _in3 (input 8x4 byte block) + * Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block) + * Return Type - as per RTYPE + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LSX_TRANSPOSE8x4_B + * _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00 + * _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00 + * + * _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00 + * _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00 + * _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00 + * _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00 + * ============================================================================= + */ +#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3) \ + { \ + __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + \ + _tmp0_m = __lsx_vpackev_w(_in4, _in0); \ + _tmp1_m = __lsx_vpackev_w(_in5, _in1); \ + _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vpackev_w(_in6, _in2); \ + _tmp1_m = __lsx_vpackev_w(_in7, _in3); \ + \ + _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m); \ + _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m); \ + _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m); \ + \ + _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m); \ + _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m); \ + _out1 = __lsx_vilvh_d(_out2, _out0); \ + _out3 = __lsx_vilvh_d(_out0, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8 + * in9, in10, in11, in12, in13, in14, in15 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + * Details : + * Example : + * 000,001,002,003,004,005,006,007 + * 008,009,010,011,012,013,014,015 + * 016,017,018,019,020,021,022,023 + * 024,025,026,027,028,029,030,031 + * 032,033,034,035,036,037,038,039 + * 040,041,042,043,044,045,046,047 000,008,...,112,120 + * 048,049,050,051,052,053,054,055 001,009,...,113,121 + * 056,057,058,059,060,061,062,063 to 002,010,...,114,122 + * 064,068,066,067,068,069,070,071 =====> 003,011,...,115,123 + * 072,073,074,075,076,077,078,079 004,012,...,116,124 + * 080,081,082,083,084,085,086,087 005,013,...,117,125 + * 088,089,090,091,092,093,094,095 006,014,...,118,126 + * 096,097,098,099,100,101,102,103 007,015,...,119,127 + * 104,105,106,107,108,109,110,111 + * 112,113,114,115,116,117,118,119 + * 120,121,122,123,124,125,126,127 + * ============================================================================= + */ +#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7; \ + __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \ + _tmp0, _tmp1, _tmp2, _tmp3); \ + DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15, \ + _in13, _tmp4, _tmp5, _tmp6, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3); \ + DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6); \ + DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7); \ + DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4); \ + DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6); \ + DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5); \ + DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3); \ + DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6); \ + DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + * Example : + * out0 = in0 + in3; + * out1 = in1 + in2; + * out2 = in1 - in2; + * out3 = in0 - in3; + * ============================================================================= + */ +#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in3); \ + _out1 = __lsx_vadd_b(_in1, _in2); \ + _out2 = __lsx_vsub_b(_in1, _in2); \ + _out3 = __lsx_vsub_b(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in3); \ + _out1 = __lsx_vadd_h(_in1, _in2); \ + _out2 = __lsx_vsub_h(_in1, _in2); \ + _out3 = __lsx_vsub_h(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in3); \ + _out1 = __lsx_vadd_w(_in1, _in2); \ + _out2 = __lsx_vsub_w(_in1, _in2); \ + _out3 = __lsx_vsub_w(_in0, _in3); \ + } +#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in3); \ + _out1 = __lsx_vadd_d(_in1, _in2); \ + _out2 = __lsx_vsub_d(_in1, _in2); \ + _out3 = __lsx_vsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in7); \ + _out1 = __lsx_vadd_b(_in1, _in6); \ + _out2 = __lsx_vadd_b(_in2, _in5); \ + _out3 = __lsx_vadd_b(_in3, _in4); \ + _out4 = __lsx_vsub_b(_in3, _in4); \ + _out5 = __lsx_vsub_b(_in2, _in5); \ + _out6 = __lsx_vsub_b(_in1, _in6); \ + _out7 = __lsx_vsub_b(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in7); \ + _out1 = __lsx_vadd_h(_in1, _in6); \ + _out2 = __lsx_vadd_h(_in2, _in5); \ + _out3 = __lsx_vadd_h(_in3, _in4); \ + _out4 = __lsx_vsub_h(_in3, _in4); \ + _out5 = __lsx_vsub_h(_in2, _in5); \ + _out6 = __lsx_vsub_h(_in1, _in6); \ + _out7 = __lsx_vsub_h(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in7); \ + _out1 = __lsx_vadd_w(_in1, _in6); \ + _out2 = __lsx_vadd_w(_in2, _in5); \ + _out3 = __lsx_vadd_w(_in3, _in4); \ + _out4 = __lsx_vsub_w(_in3, _in4); \ + _out5 = __lsx_vsub_w(_in2, _in5); \ + _out6 = __lsx_vsub_w(_in1, _in6); \ + _out7 = __lsx_vsub_w(_in0, _in7); \ + } + +#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in7); \ + _out1 = __lsx_vadd_d(_in1, _in6); \ + _out2 = __lsx_vadd_d(_in2, _in5); \ + _out3 = __lsx_vadd_d(_in3, _in4); \ + _out4 = __lsx_vsub_d(_in3, _in4); \ + _out5 = __lsx_vsub_d(_in2, _in5); \ + _out6 = __lsx_vsub_d(_in1, _in6); \ + _out7 = __lsx_vsub_d(_in0, _in7); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 16 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : + * _out0 = _in0 + _in15; + * _out1 = _in1 + _in14; + * _out2 = _in2 + _in13; + * _out3 = _in3 + _in12; + * _out4 = _in4 + _in11; + * _out5 = _in5 + _in10; + * _out6 = _in6 + _in9; + * _out7 = _in7 + _in8; + * _out8 = _in7 - _in8; + * _out9 = _in6 - _in9; + * _out10 = _in5 - _in10; + * _out11 = _in4 - _in11; + * _out12 = _in3 - _in12; + * _out13 = _in2 - _in13; + * _out14 = _in1 - _in14; + * _out15 = _in0 - _in15; + * ============================================================================= + */ + +#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_b(_in0, _in15); \ + _out1 = __lsx_vadd_b(_in1, _in14); \ + _out2 = __lsx_vadd_b(_in2, _in13); \ + _out3 = __lsx_vadd_b(_in3, _in12); \ + _out4 = __lsx_vadd_b(_in4, _in11); \ + _out5 = __lsx_vadd_b(_in5, _in10); \ + _out6 = __lsx_vadd_b(_in6, _in9); \ + _out7 = __lsx_vadd_b(_in7, _in8); \ + \ + _out8 = __lsx_vsub_b(_in7, _in8); \ + _out9 = __lsx_vsub_b(_in6, _in9); \ + _out10 = __lsx_vsub_b(_in5, _in10); \ + _out11 = __lsx_vsub_b(_in4, _in11); \ + _out12 = __lsx_vsub_b(_in3, _in12); \ + _out13 = __lsx_vsub_b(_in2, _in13); \ + _out14 = __lsx_vsub_b(_in1, _in14); \ + _out15 = __lsx_vsub_b(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_h(_in0, _in15); \ + _out1 = __lsx_vadd_h(_in1, _in14); \ + _out2 = __lsx_vadd_h(_in2, _in13); \ + _out3 = __lsx_vadd_h(_in3, _in12); \ + _out4 = __lsx_vadd_h(_in4, _in11); \ + _out5 = __lsx_vadd_h(_in5, _in10); \ + _out6 = __lsx_vadd_h(_in6, _in9); \ + _out7 = __lsx_vadd_h(_in7, _in8); \ + \ + _out8 = __lsx_vsub_h(_in7, _in8); \ + _out9 = __lsx_vsub_h(_in6, _in9); \ + _out10 = __lsx_vsub_h(_in5, _in10); \ + _out11 = __lsx_vsub_h(_in4, _in11); \ + _out12 = __lsx_vsub_h(_in3, _in12); \ + _out13 = __lsx_vsub_h(_in2, _in13); \ + _out14 = __lsx_vsub_h(_in1, _in14); \ + _out15 = __lsx_vsub_h(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_w(_in0, _in15); \ + _out1 = __lsx_vadd_w(_in1, _in14); \ + _out2 = __lsx_vadd_w(_in2, _in13); \ + _out3 = __lsx_vadd_w(_in3, _in12); \ + _out4 = __lsx_vadd_w(_in4, _in11); \ + _out5 = __lsx_vadd_w(_in5, _in10); \ + _out6 = __lsx_vadd_w(_in6, _in9); \ + _out7 = __lsx_vadd_w(_in7, _in8); \ + \ + _out8 = __lsx_vsub_w(_in7, _in8); \ + _out9 = __lsx_vsub_w(_in6, _in9); \ + _out10 = __lsx_vsub_w(_in5, _in10); \ + _out11 = __lsx_vsub_w(_in4, _in11); \ + _out12 = __lsx_vsub_w(_in3, _in12); \ + _out13 = __lsx_vsub_w(_in2, _in13); \ + _out14 = __lsx_vsub_w(_in1, _in14); \ + _out15 = __lsx_vsub_w(_in0, _in15); \ + } + +#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7, _out8, _out9, _out10, _out11, _out12, \ + _out13, _out14, _out15) \ + { \ + _out0 = __lsx_vadd_d(_in0, _in15); \ + _out1 = __lsx_vadd_d(_in1, _in14); \ + _out2 = __lsx_vadd_d(_in2, _in13); \ + _out3 = __lsx_vadd_d(_in3, _in12); \ + _out4 = __lsx_vadd_d(_in4, _in11); \ + _out5 = __lsx_vadd_d(_in5, _in10); \ + _out6 = __lsx_vadd_d(_in6, _in9); \ + _out7 = __lsx_vadd_d(_in7, _in8); \ + \ + _out8 = __lsx_vsub_d(_in7, _in8); \ + _out9 = __lsx_vsub_d(_in6, _in9); \ + _out10 = __lsx_vsub_d(_in5, _in10); \ + _out11 = __lsx_vsub_d(_in4, _in11); \ + _out12 = __lsx_vsub_d(_in3, _in12); \ + _out13 = __lsx_vsub_d(_in2, _in13); \ + _out14 = __lsx_vsub_d(_in1, _in14); \ + _out15 = __lsx_vsub_d(_in0, _in15); \ + } + +#endif // LSX + +#ifdef __loongarch_asx +#include +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplication results of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_b(in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : out = __lasx_xvdp2_w_h(in_h, in_l) + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1 + * out : 22,38,38,22, 22,38,38,22 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of word vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed double + * Details : Signed word elements from in_h are multiplied with + * signed word elements from in_l producing a result + * twice the size of input i.e. signed double-word. + * Then these multiplied results of adjacent odd-even elements + * are added to the out vector. + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_d_w(in_h, in_l); + out = __lasx_xvmaddwod_d_w(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. unsigned word. + * Multiplication result of adjacent odd-even elements + * are added to the out vector + * Example : See out = __lasx_xvdp2_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_hu_h(in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Signed byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product & addition of byte vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - halfword + * Details : Unsigned byte elements from in_h are multiplied with + * signed byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Then these multiplied results of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l); + out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - per RTYPE + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 1,2,3,4 + * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8, + * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1, + * out : 23,40,41,26, 23,40,41,26 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * unsigned halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector. + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Unsigned halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added to the in_c vector + * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l); + out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l); + return out; +} + +/* + * ============================================================================= + * Description : Vector Unsigned Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed halfword + * Details : Unsigned byte elements from in_h are multiplied with + * unsigned byte elements from in_l producing a result + * twice the size of input i.e. signed halfword. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_h_bu(in_h, in_l); + out = __lasx_xvmaddwod_h_bu(out, in_h, in_l); + out = __lasx_xvsub_h(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Vector Signed Dot Product and Subtract + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * Signed halfword elements from in_l producing a result + * twice the size of input i.e. signed word. + * Multiplication result of adjacent odd-even elements + * are added together and subtracted from double width elements + * in_c vector. + * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l) + * in_c : 0,0,0,0, 0,0,0,0 + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1 + * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * out : -7,-3,0,0, 0,-1,0,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvsub_w(in_c, out); + return out; +} + +/* + * ============================================================================= + * Description : Dot product of halfword vector elements + * Arguments : Inputs - in_h, in_l + * Output - out + * Return Type - signed word + * Details : Signed halfword elements from in_h are multiplied with + * signed halfword elements from in_l producing a result + * four times the size of input i.e. signed doubleword. + * Then these multiplication results of four adjacent elements + * are added together and stored to the out vector. + * Example : out = __lasx_xvdp4_d_h(in_h, in_l) + * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1 + * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1 + * out : -2,0,1,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvmulwev_w_h(in_h, in_l); + out = __lasx_xvmaddwod_w_h(out, in_h, in_l); + out = __lasx_xvhaddw_d_w(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The high half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * higher half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwh_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 1,0,0,-1, 1,0,0, 2 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvh_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed byte + * to signed halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_h_b(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are added after the + * lower half of the two-fold sign extension (signed halfword + * to signed word) and stored to the out vector. + * Example : out = __lasx_xvaddwl_w_h(in_h, in_l) + * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1 + * out : 5,-1,4,2, 1,0,2,-1 + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_h(in_h, in_l); + out = __lasx_xvhaddw_w_h(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The out vector and the out vector are added after the + * lower half of the two-fold zero extension (unsigned byte + * to unsigned halfword) and stored to the out vector. + * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvilvl_b(in_h, in_l); + out = __lasx_xvhaddw_hu_bu(out, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double zero extension (unsigned byte to + * signed halfword),added to the in_h vector. + * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_hu_bu(in_l, 0); + out = __lasx_xvadd_h(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are expanded and + * added after being doubled. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_l vector after double sign extension (signed halfword to + * signed word), added to the in_h vector. + * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l) + * in_h : 0, 1,0,0, -1,0,0,1, + * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1, + * out : 2, 0,1,2, -1,0,1,1, + * ============================================================================= + */ +static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) { + __m256i out; + + out = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvadd_w(in_h, out); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the lower half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed halfword + * to signed word), and the result is added to the vector in_c, + * then stored to the out vector. + * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * in_c : 1,2,3,4, 5,6,7,8 + * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8 + * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000, + * -200,-300,-400,-500, -2000,-3000,-4000,-5000 + * out : 201, 602,1203,2004, -995, -1794,-2793,-3992 + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + tmp0 = __lasx_xvmul_w(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication and addition calculation after expansion + * of the higher half of the vector. + * Arguments : Inputs - in_c, in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the higher half of the two-fold sign extension (signed + * halfword to signed word), and the result is added to + * the vector in_c, then stored to the out vector. + * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l) + * ============================================================================= + */ +static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, + __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1); + out = __lasx_xvadd_w(tmp0, in_c); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwl_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 6,1,3,0, 0,0,1,0 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvsllwil_w_h(in_h, 0); + tmp1 = __lasx_xvsllwil_w_h(in_l, 0); + out = __lasx_xvmul_w(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Multiplication calculation after expansion of the lower + * half of the vector. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector and the in_l vector are multiplied after + * the lower half of the two-fold sign extension (signed + * halfword to signed word), then stored to the out vector. + * Example : out = __lasx_xvmulwh_w_h(in_h, in_l) + * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1 + * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1 + * out : 0,0,0,0, 0,0,0,1 + * ============================================================================= + */ +static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) { + __m256i tmp0, tmp1, out; + + tmp0 = __lasx_xvilvh_h(in_h, in_h); + tmp1 = __lasx_xvilvh_h(in_l, in_l); + out = __lasx_xvmulwev_w_h(tmp0, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : The low half of the vector elements are added to the high half + * after being doubled, then saturated. + * Arguments : Inputs - in_h, in_l + * Output - out + * Details : The in_h vector adds the in_l vector after the lower half of + * the two-fold zero extension (unsigned byte to unsigned + * halfword) and then saturated. The results are stored to the out + * vector. + * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l) + * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1 + * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, + * 0,0,0,1 + * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2, + * ============================================================================= + */ +static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) { + __m256i tmp1, out; + __m256i zero = { 0 }; + + tmp1 = __lasx_xvilvl_b(zero, in_l); + out = __lasx_xvsadd_hu(in_h, tmp1); + return out; +} + +/* + * ============================================================================= + * Description : Clip all halfword elements of input vector between min & max + * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in)) + * Arguments : Inputs - in (input vector) + * - min (min threshold) + * - max (max threshold) + * Outputs - in (output vector with clipped elements) + * Return Type - signed halfword + * Example : out = __lasx_xvclip_h(in, min, max) + * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5 + * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 + * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9 + * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) { + __m256i out; + + out = __lasx_xvmax_h(min, in); + out = __lasx_xvmin_h(max, out); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Outputs - out (output vector with clipped elements) + * Return Type - signed halfword + * Example : See out = __lasx_xvclip255_w(in) + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_h(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_h(in, 0); + out = __lasx_xvsat_hu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Clip all signed word elements of input vector + * between 0 & 255 + * Arguments : Inputs - in (input vector) + * Output - out (output vector with clipped elements) + * Return Type - signed word + * Example : out = __lasx_xvclip255_w(in) + * in : -8,255,280,249, -8,255,280,249 + * out : 0,255,255,249, 0,255,255,249 + * ============================================================================= + */ +static inline __m256i __lasx_xvclip255_w(__m256i in) { + __m256i out; + + out = __lasx_xvmaxi_w(in, 0); + out = __lasx_xvsat_wu(out, 7); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_l_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0 + * idx : 0x02 + * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x02); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Indexed halfword element values are replicated to all + * elements in output vector. If 'idx < 8' use xvsplati_l_*, + * if 'idx >= 8' use xvsplati_h_*. + * Arguments : Inputs - in, idx + * Output - out + * Details : Idx element value from in vector is replicated to all + * elements in out vector. + * Valid index range for halfword operation is 0-7 + * Example : out = __lasx_xvsplati_h_h(in, idx) + * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0 + * idx : 0x09 + * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * ============================================================================= + */ +static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) { + __m256i out; + + out = __lasx_xvpermi_q(in, in, 0x13); + out = __lasx_xvreplve_h(out, idx); + return out; +} + +/* + * ============================================================================= + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Example : LASX_TRANSPOSE4x4_D + * _in0 : 1,2,3,4 + * _in1 : 1,2,3,4 + * _in2 : 1,2,3,4 + * _in3 : 1,2,3,4 + * + * _out0 : 1,1,1,1 + * _out1 : 2,2,2,2 + * _out2 : 3,3,3,3 + * _out3 : 4,4,4,4 + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _tmp0, _tmp1, _tmp2, _tmp3; \ + _tmp0 = __lasx_xvilvl_d(_in1, _in0); \ + _tmp1 = __lasx_xvilvh_d(_in1, _in0); \ + _tmp2 = __lasx_xvilvl_d(_in3, _in2); \ + _tmp3 = __lasx_xvilvh_d(_in3, _in2); \ + _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \ + _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * _in0 : 1,2,3,4,5,6,7,8 + * _in1 : 2,2,3,4,5,6,7,8 + * _in2 : 3,2,3,4,5,6,7,8 + * _in3 : 4,2,3,4,5,6,7,8 + * _in4 : 5,2,3,4,5,6,7,8 + * _in5 : 6,2,3,4,5,6,7,8 + * _in6 : 7,2,3,4,5,6,7,8 + * _in7 : 8,2,3,4,5,6,7,8 + * + * _out0 : 1,2,3,4,5,6,7,8 + * _out1 : 2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_w(_in2, _in0); \ + _s1_m = __lasx_xvilvl_w(_in3, _in1); \ + _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in2, _in0); \ + _s1_m = __lasx_xvilvh_w(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvl_w(_in6, _in4); \ + _s1_m = __lasx_xvilvl_w(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_w(_in6, _in4); \ + _s1_m = __lasx_xvilvh_w(_in7, _in5); \ + _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \ + _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \ + _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \ + _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE16x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \ + _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \ + _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \ + _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \ + _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \ + _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \ + _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \ + _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \ + _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \ + _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \ + _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \ + _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \ + _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \ + _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \ + _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \ + _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \ + _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \ + } + +/* + * ============================================================================= + * Description : Transpose input 16x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, + * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15 + * (input 16x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x16 byte block) + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE16x8_H + * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0 + * + * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6 + * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2 + * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3 + * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4 + * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 + * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 + * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 + * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _in8, _in9, _in10, _in11, _in12, _in13, _in14, \ + _in15, _out0, _out1, _out2, _out3, _out4, _out5, \ + _out6, _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \ + \ + _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + \ + _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \ + _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \ + _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \ + _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \ + _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \ + _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \ + _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \ + _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \ + _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \ + _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \ + _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \ + _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \ + _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \ + _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \ + _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \ + _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \ + _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \ + _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \ + _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \ + _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \ + _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \ + _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \ + _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \ + _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \ + _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \ + _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \ + } + +/* + * ============================================================================= + * Description : Transpose 4x4 block with halfword elements in vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Return Type - signed halfword + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \ + _out3) \ + { \ + __m256i _s0_m, _s1_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in1, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in2); \ + _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \ + _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \ + _out1 = __lasx_xvilvh_d(_out0, _out0); \ + _out3 = __lasx_xvilvh_d(_out2, _out2); \ + } + +/* + * ============================================================================= + * Description : Transpose input 8x8 byte block + * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7 + * (input 8x8 byte block) + * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, + * _out7 (output 8x8 byte block) + * Example : See LASX_TRANSPOSE8x8_H + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \ + _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \ + _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \ + _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \ + _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \ + _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \ + _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \ + _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \ + _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \ + _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \ + _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \ + _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \ + _out1 = __lasx_xvbsrl_v(_out0, 8); \ + _out3 = __lasx_xvbsrl_v(_out2, 8); \ + _out5 = __lasx_xvbsrl_v(_out4, 8); \ + _out7 = __lasx_xvbsrl_v(_out6, 8); \ + } + +/* + * ============================================================================= + * Description : Transpose 8x8 block with halfword elements in vectors. + * Arguments : Inputs - _in0, _in1, ~ + * Outputs - _out0, _out1, ~ + * Details : The rows of the matrix become columns, and the columns become + * rows. + * Example : LASX_TRANSPOSE8x8_H + * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8 + * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8 + * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8 + * + * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9 + * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 + * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3 + * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4 + * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5 + * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6 + * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7 + * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8 + * ============================================================================= + */ +#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + __m256i _s0_m, _s1_m; \ + __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \ + __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \ + \ + _s0_m = __lasx_xvilvl_h(_in6, _in4); \ + _s1_m = __lasx_xvilvl_h(_in7, _in5); \ + _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in6, _in4); \ + _s1_m = __lasx_xvilvh_h(_in7, _in5); \ + _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _s0_m = __lasx_xvilvl_h(_in2, _in0); \ + _s1_m = __lasx_xvilvl_h(_in3, _in1); \ + _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + _s0_m = __lasx_xvilvh_h(_in2, _in0); \ + _s1_m = __lasx_xvilvh_h(_in3, _in1); \ + _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \ + _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \ + \ + _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \ + _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \ + _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \ + _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \ + _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \ + _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \ + _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \ + _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 4 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3 + * Outputs - _out0, _out1, _out2, _out3 + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_4 + * _out0 = _in0 + _in3; + * _out1 = _in1 + _in2; + * _out2 = _in1 - _in2; + * _out3 = _in0 - _in3; + * ============================================================================= + */ +#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in3); \ + _out1 = __lasx_xvadd_b(_in1, _in2); \ + _out2 = __lasx_xvsub_b(_in1, _in2); \ + _out3 = __lasx_xvsub_b(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in3); \ + _out1 = __lasx_xvadd_h(_in1, _in2); \ + _out2 = __lasx_xvsub_h(_in1, _in2); \ + _out3 = __lasx_xvsub_h(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in3); \ + _out1 = __lasx_xvadd_w(_in1, _in2); \ + _out2 = __lasx_xvsub_w(_in1, _in2); \ + _out3 = __lasx_xvsub_w(_in0, _in3); \ + } +#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in3); \ + _out1 = __lasx_xvadd_d(_in1, _in2); \ + _out2 = __lasx_xvsub_d(_in1, _in2); \ + _out3 = __lasx_xvsub_d(_in0, _in3); \ + } + +/* + * ============================================================================= + * Description : Butterfly of 8 input vectors + * Arguments : Inputs - _in0, _in1, _in2, _in3, ~ + * Outputs - _out0, _out1, _out2, _out3, ~ + * Details : Butterfly operation + * Example : LASX_BUTTERFLY_8 + * _out0 = _in0 + _in7; + * _out1 = _in1 + _in6; + * _out2 = _in2 + _in5; + * _out3 = _in3 + _in4; + * _out4 = _in3 - _in4; + * _out5 = _in2 - _in5; + * _out6 = _in1 - _in6; + * _out7 = _in0 - _in7; + * ============================================================================= + */ +#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_b(_in0, _in7); \ + _out1 = __lasx_xvadd_b(_in1, _in6); \ + _out2 = __lasx_xvadd_b(_in2, _in5); \ + _out3 = __lasx_xvadd_b(_in3, _in4); \ + _out4 = __lasx_xvsub_b(_in3, _in4); \ + _out5 = __lasx_xvsub_b(_in2, _in5); \ + _out6 = __lasx_xvsub_b(_in1, _in6); \ + _out7 = __lasx_xvsub_b(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_h(_in0, _in7); \ + _out1 = __lasx_xvadd_h(_in1, _in6); \ + _out2 = __lasx_xvadd_h(_in2, _in5); \ + _out3 = __lasx_xvadd_h(_in3, _in4); \ + _out4 = __lasx_xvsub_h(_in3, _in4); \ + _out5 = __lasx_xvsub_h(_in2, _in5); \ + _out6 = __lasx_xvsub_h(_in1, _in6); \ + _out7 = __lasx_xvsub_h(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_w(_in0, _in7); \ + _out1 = __lasx_xvadd_w(_in1, _in6); \ + _out2 = __lasx_xvadd_w(_in2, _in5); \ + _out3 = __lasx_xvadd_w(_in3, _in4); \ + _out4 = __lasx_xvsub_w(_in3, _in4); \ + _out5 = __lasx_xvsub_w(_in2, _in5); \ + _out6 = __lasx_xvsub_w(_in1, _in6); \ + _out7 = __lasx_xvsub_w(_in0, _in7); \ + } + +#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \ + _out0, _out1, _out2, _out3, _out4, _out5, _out6, \ + _out7) \ + { \ + _out0 = __lasx_xvadd_d(_in0, _in7); \ + _out1 = __lasx_xvadd_d(_in1, _in6); \ + _out2 = __lasx_xvadd_d(_in2, _in5); \ + _out3 = __lasx_xvadd_d(_in3, _in4); \ + _out4 = __lasx_xvsub_d(_in3, _in4); \ + _out5 = __lasx_xvsub_d(_in2, _in5); \ + _out6 = __lasx_xvsub_d(_in1, _in6); \ + _out7 = __lasx_xvsub_d(_in0, _in7); \ + } + +#endif // LASX + +/* + * ============================================================================= + * Description : Print out elements in vector. + * Arguments : Inputs - RTYPE, _element_num, _in0, _enter + * Outputs - + * Details : Print out '_element_num' elements in 'RTYPE' vector '_in0', if + * '_enter' is TRUE, prefix "\nVP:" will be added first. + * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4 + * VP:1,2,3,4, + * ============================================================================= + */ +#define VECT_PRINT(RTYPE, element_num, in0, enter) \ + { \ + RTYPE _tmp0 = (RTYPE)in0; \ + int _i = 0; \ + if (enter) printf("\nVP:"); \ + for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \ + } + +#endif /* LOONGSON_INTRINSICS_H */ +#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */ diff --git a/media/libvpx/libvpx/vpx_util/vpx_atomics.h b/media/libvpx/libvpx/vpx_util/vpx_atomics.h new file mode 100644 index 0000000000..13c1fc11f6 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_atomics.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_VPX_UTIL_VPX_ATOMICS_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +// Look for built-in atomic support. We cannot use or +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both and existed it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true. Consider adding +// support for using instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?), +// there're no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC or the default /volatile:ms volatile int are +// acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif VPX_ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif VPX_ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly). +// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) { num } + +// Initialization of an atomic int, not thread safe. +static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_ATOMICS_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.c b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c new file mode 100644 index 0000000000..3ce4065ba5 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "vpx_util/vpx_debug_util.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +static int frame_idx_w = 0; +static int frame_idx_r = 0; + +void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; } + +int bitstream_queue_get_frame_write(void) { return frame_idx_w; } + +void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; } + +int bitstream_queue_get_frame_read(void) { return frame_idx_r; } +#endif + +#if CONFIG_BITSTREAM_DEBUG +#define QUEUE_MAX_SIZE 2000000 +static int result_queue[QUEUE_MAX_SIZE]; +static int prob_queue[QUEUE_MAX_SIZE]; + +static int queue_r = 0; +static int queue_w = 0; +static int queue_prev_w = -1; +static int skip_r = 0; +static int skip_w = 0; +void bitstream_queue_set_skip_write(int skip) { skip_w = skip; } + +void bitstream_queue_set_skip_read(int skip) { skip_r = skip; } + +void bitstream_queue_record_write(void) { queue_prev_w = queue_w; } + +void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; } + +int bitstream_queue_get_write(void) { return queue_w; } + +int bitstream_queue_get_read(void) { return queue_r; } + +void bitstream_queue_pop(int *result, int *prob) { + if (!skip_r) { + if (queue_w == queue_r) { + printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + *result = result_queue[queue_r]; + *prob = prob_queue[queue_r]; + queue_r = (queue_r + 1) % QUEUE_MAX_SIZE; + } +} + +void bitstream_queue_push(int result, const int prob) { + if (!skip_w) { + result_queue[queue_w] = result; + prob_queue[queue_w] = prob; + queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; + if (queue_w == queue_r) { + printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + } +} +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +static int frame_buf_idx_r = 0; +static int frame_buf_idx_w = 0; +#define MAX_FRAME_BUF_NUM 20 +#define MAX_FRAME_STRIDE 1920 +#define MAX_FRAME_HEIGHT 1080 +static uint16_t + frame_pre[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w(void) { + frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + int plane; + for (plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r(void) { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w" + " %d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_pre failed frame_idx %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_tx failed frame_idx %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/media/libvpx/libvpx/vpx_util/vpx_debug_util.h b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h new file mode 100644 index 0000000000..df1a1aab2c --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_debug_util.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ +#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +void bitstream_queue_set_frame_write(int frame_idx); +int bitstream_queue_get_frame_write(void); +void bitstream_queue_set_frame_read(int frame_idx); +int bitstream_queue_get_frame_read(void); +#endif + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. */ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, int *prob); +void bitstream_queue_push(int result, const int prob); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(void); +void mismatch_move_frame_idx_r(void); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_pthread.h b/media/libvpx/libvpx/vpx_util/vpx_pthread.h new file mode 100644 index 0000000000..4ed32d4632 --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_pthread.h @@ -0,0 +1,150 @@ +// Copyright 2024 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// pthread.h wrapper + +#ifndef VPX_VPX_UTIL_VPX_PTHREAD_H_ +#define VPX_VPX_UTIL_VPX_PTHREAD_H_ + +#include "./vpx_config.h" + +#if CONFIG_MULTITHREAD + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include // NOLINT +#include // NOLINT +#include // NOLINT +typedef HANDLE pthread_t; +typedef SRWLOCK pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. +#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_EXIT_SUCCESS 0 + +static INLINE int pthread_create(pthread_t *const thread, const void *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeSRWLock(mutex); + return 0; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + AcquireSRWLockExclusive(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + ReleaseSRWLockExclusive(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + (void)mutex; + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + const int ok = SleepConditionVariableSRW(condition, mutex, INFINITE, 0); + return !ok; +} +#else // _WIN32 +#include // NOLINT +#define THREADFN void * +#define THREAD_EXIT_SUCCESS NULL +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CONFIG_MULTITHREAD + +#endif // VPX_VPX_UTIL_VPX_PTHREAD_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.c b/media/libvpx/libvpx/vpx_util/vpx_thread.c index 04c5fb6f26..0d0e2f5766 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_thread.c +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.c @@ -12,10 +12,18 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp +// Enable GNU extensions in glibc so that we can call pthread_setname_np(). +// This must be before any #include statements. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include #include // for memset() +#include "./vpx_config.h" #include "./vpx_thread.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_pthread.h" #if CONFIG_MULTITHREAD @@ -31,23 +39,54 @@ static void execute(VPxWorker *const worker); // Forward declaration. static THREADFN thread_loop(void *ptr) { VPxWorker *const worker = (VPxWorker *)ptr; - int done = 0; - while (!done) { - pthread_mutex_lock(&worker->impl_->mutex_); - while (worker->status_ == OK) { // wait in idling mode +#ifdef __APPLE__ + if (worker->thread_name != NULL) { + // Apple's version of pthread_setname_np takes one argument and operates on + // the current thread only. The maximum size of the thread_name buffer was + // noted in the Chromium source code and was confirmed by experiments. If + // thread_name is too long, pthread_setname_np returns -1 with errno + // ENAMETOOLONG (63). + char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif (defined(__GLIBC__) && !defined(__GNU__)) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). + char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + pthread_mutex_lock(&worker->impl_->mutex_); + for (;;) { + while (worker->status_ == VPX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } - if (worker->status_ == WORK) { + if (worker->status_ == VPX_WORKER_STATUS_WORKING) { + // When worker->status_ is VPX_WORKER_STATUS_WORKING, the main thread + // doesn't change worker->status_ and will wait until the worker changes + // worker->status_ to VPX_WORKER_STATUS_OK. See change_state(). So the + // worker can safely call execute() without holding worker->impl_->mutex_. + // When the worker reacquires worker->impl_->mutex_, worker->status_ must + // still be VPX_WORKER_STATUS_WORKING. + pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); - worker->status_ = OK; - } else if (worker->status_ == NOT_OK) { // finish the worker - done = 1; + pthread_mutex_lock(&worker->impl_->mutex_); + assert(worker->status_ == VPX_WORKER_STATUS_WORKING); + worker->status_ = VPX_WORKER_STATUS_OK; + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + } else { + assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); // finish the worker + break; } - // signal to the main thread that we're done (for sync()) - pthread_cond_signal(&worker->impl_->condition_); - pthread_mutex_unlock(&worker->impl_->mutex_); } - return THREAD_RETURN(NULL); // Thread is finished + pthread_mutex_unlock(&worker->impl_->mutex_); + return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control @@ -58,13 +97,13 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); - if (worker->status_ >= OK) { + if (worker->status_ >= VPX_WORKER_STATUS_OK) { // wait for the worker to finish - while (worker->status_ != OK) { + while (worker->status_ != VPX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed - if (new_status != OK) { + if (new_status != VPX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } @@ -78,21 +117,21 @@ static void change_state(VPxWorker *const worker, VPxWorkerStatus new_status) { static void init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; + worker->status_ = VPX_WORKER_STATUS_NOT_OK; } static int sync(VPxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, OK); + change_state(worker, VPX_WORKER_STATUS_OK); #endif - assert(worker->status_ <= OK); + assert(worker->status_ <= VPX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(VPxWorker *const worker) { int ok = 1; worker->had_error = 0; - if (worker->status_ < OK) { + if (worker->status_ < VPX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (VPxWorkerImpl *)vpx_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { @@ -107,7 +146,7 @@ static int reset(VPxWorker *const worker) { } pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); - if (ok) worker->status_ = OK; + if (ok) worker->status_ = VPX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); if (!ok) { pthread_mutex_destroy(&worker->impl_->mutex_); @@ -118,12 +157,12 @@ static int reset(VPxWorker *const worker) { return 0; } #else - worker->status_ = OK; + worker->status_ = VPX_WORKER_STATUS_OK; #endif - } else if (worker->status_ > OK) { + } else if (worker->status_ > VPX_WORKER_STATUS_OK) { ok = sync(worker); } - assert(!ok || (worker->status_ == OK)); + assert(!ok || (worker->status_ == VPX_WORKER_STATUS_OK)); return ok; } @@ -135,7 +174,7 @@ static void execute(VPxWorker *const worker) { static void launch(VPxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, WORK); + change_state(worker, VPX_WORKER_STATUS_WORKING); #else execute(worker); #endif @@ -144,7 +183,7 @@ static void launch(VPxWorker *const worker) { static void end(VPxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { - change_state(worker, NOT_OK); + change_state(worker, VPX_WORKER_STATUS_NOT_OK); pthread_join(worker->impl_->thread_, NULL); pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); @@ -152,10 +191,10 @@ static void end(VPxWorker *const worker) { worker->impl_ = NULL; } #else - worker->status_ = NOT_OK; + worker->status_ = VPX_WORKER_STATUS_NOT_OK; assert(worker->impl_ == NULL); #endif - assert(worker->status_ == NOT_OK); + assert(worker->status_ == VPX_WORKER_STATUS_NOT_OK); } //------------------------------------------------------------------------------ diff --git a/media/libvpx/libvpx/vpx_util/vpx_thread.h b/media/libvpx/libvpx/vpx_util/vpx_thread.h index 53a5f4966a..22051eae41 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_thread.h +++ b/media/libvpx/libvpx/vpx_util/vpx_thread.h @@ -12,350 +12,23 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp -#ifndef VPX_THREAD_H_ -#define VPX_THREAD_H_ - -#include "./vpx_config.h" +#ifndef VPX_VPX_UTIL_VPX_THREAD_H_ +#define VPX_VPX_UTIL_VPX_THREAD_H_ #ifdef __cplusplus extern "C" { #endif -// Set maximum decode threads to be 8 due to the limit of frame buffers -// and not enough semaphores in the emulation layer on windows. -#define MAX_DECODE_THREADS 8 - -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) && !HAVE_PTHREAD_H -#include // NOLINT -#include // NOLINT -#include // NOLINT -typedef HANDLE pthread_t; -typedef CRITICAL_SECTION pthread_mutex_t; - -#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater -#define USE_WINDOWS_CONDITION_VARIABLE -typedef CONDITION_VARIABLE pthread_cond_t; -#else -typedef struct { - HANDLE waiting_sem_; - HANDLE received_sem_; - HANDLE signal_event_; -} pthread_cond_t; -#endif // _WIN32_WINNT >= 0x600 - -#ifndef WINAPI_FAMILY_PARTITION -#define WINAPI_PARTITION_DESKTOP 1 -#define WINAPI_FAMILY_PARTITION(x) x -#endif - -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define USE_CREATE_THREAD -#endif - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -// _beginthreadex requires __stdcall -#if defined(__GNUC__) && \ - (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) -#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall -#else -#define THREADFN unsigned int __stdcall -#endif -#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) - -#if _WIN32_WINNT >= 0x0501 // Windows XP or greater -#define WaitForSingleObject(obj, timeout) \ - WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/) -#endif - -static INLINE int pthread_create(pthread_t *const thread, const void *attr, - unsigned int(__stdcall *start)(void *), - void *arg) { - (void)attr; -#ifdef USE_CREATE_THREAD - *thread = CreateThread(NULL, /* lpThreadAttributes */ - 0, /* dwStackSize */ - start, arg, 0, /* dwStackSize */ - NULL); /* lpThreadId */ -#else - *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ - 0, /* unsigned stack_size */ - start, arg, 0, /* unsigned initflag */ - NULL); /* unsigned *thrdaddr */ -#endif - if (*thread == NULL) return 1; - SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); - return 0; -} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || - CloseHandle(thread) == 0); -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; -#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater - InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); -#else - InitializeCriticalSection(mutex); -#endif - return 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return TryEnterCriticalSection(mutex) ? 0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - EnterCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - LeaveCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - DeleteCriticalSection(mutex); - return 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - int ok = 1; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - (void)condition; -#else - ok &= (CloseHandle(condition->waiting_sem_) != 0); - ok &= (CloseHandle(condition->received_sem_) != 0); - ok &= (CloseHandle(condition->signal_event_) != 0); -#endif - return !ok; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - (void)cond_attr; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - InitializeConditionVariable(condition); -#else - condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); - condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL); - condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); - if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL || - condition->signal_event_ == NULL) { - pthread_cond_destroy(condition); - return 1; - } -#endif - return 0; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - int ok = 1; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - WakeConditionVariable(condition); -#else - if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { - // a thread is waiting in pthread_cond_wait: allow it to be notified - ok = SetEvent(condition->signal_event_); - // wait until the event is consumed so the signaler cannot consume - // the event via its own pthread_cond_wait. - ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != - WAIT_OBJECT_0); - } -#endif - return !ok; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok; -#ifdef USE_WINDOWS_CONDITION_VARIABLE - ok = SleepConditionVariableCS(condition, mutex, INFINITE); -#else - // note that there is a consumer available so the signal isn't dropped in - // pthread_cond_signal - if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1; - // now unlock the mutex so pthread_cond_signal may be issued - pthread_mutex_unlock(mutex); - ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == - WAIT_OBJECT_0); - ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); - pthread_mutex_lock(mutex); -#endif - return !ok; -} -#elif defined(__OS2__) -#define INCL_DOS -#include // NOLINT - -#include // NOLINT -#include // NOLINT -#include // NOLINT - -#define pthread_t TID -#define pthread_mutex_t HMTX - -typedef struct { - HEV event_sem_; - HEV ack_sem_; - volatile unsigned wait_count_; -} pthread_cond_t; - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -#define THREADFN void * -#define THREAD_RETURN(val) (val) - -typedef struct { - void *(*start_)(void *); - void *arg_; -} thread_arg; - -static void thread_start(void *arg) { - thread_arg targ = *(thread_arg *)arg; - free(arg); - - targ.start_(targ.arg_); -} - -static INLINE int pthread_create(pthread_t *const thread, const void *attr, - void *(*start)(void *), void *arg) { - int tid; - thread_arg *targ = (thread_arg *)malloc(sizeof(*targ)); - if (targ == NULL) return 1; - - (void)attr; - - targ->start_ = start; - targ->arg_ = arg; - tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ); - if (tid == -1) { - free(targ); - return 1; - } - - *thread = tid; - return 0; -} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return DosWaitThread(&thread, DCWW_WAIT) != 0; -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; - return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - return DosReleaseMutexSem(*mutex) != 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - return DosCloseMutexSem(*mutex) != 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - int ok = 1; - ok &= DosCloseEventSem(condition->event_sem_) == 0; - ok &= DosCloseEventSem(condition->ack_sem_) == 0; - return !ok; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - int ok = 1; - (void)cond_attr; - - ok &= - DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0; - ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0; - if (!ok) { - pthread_cond_destroy(condition); - return 1; - } - condition->wait_count_ = 0; - return 0; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - int ok = 1; - - if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) { - ok &= DosPostEventSem(condition->event_sem_) == 0; - ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0; - } - - return !ok; -} - -static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { - int ok = 1; - - while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) - ok &= pthread_cond_signal(condition) == 0; - - return !ok; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok = 1; - - __atomic_increment(&condition->wait_count_); - - ok &= pthread_mutex_unlock(mutex) == 0; - - ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0; - - __atomic_decrement(&condition->wait_count_); - - ok &= DosPostEventSem(condition->ack_sem_) == 0; - - pthread_mutex_lock(mutex); - - return !ok; -} -#else // _WIN32 -#include // NOLINT -#define THREADFN void * -#define THREAD_RETURN(val) val -#endif - -#endif // CONFIG_MULTITHREAD - // State of the worker thread object typedef enum { - NOT_OK = 0, // object is unusable - OK, // ready to work - WORK // busy finishing the current task + VPX_WORKER_STATUS_NOT_OK = 0, // object is unusable + VPX_WORKER_STATUS_OK, // ready to work + VPX_WORKER_STATUS_WORKING // busy finishing the current task } VPxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as -// arguments (data1 and data2), and should return false in case of error. +// arguments (data1 and data2). Should return true on success and return false +// in case of error. typedef int (*VPxWorkerHook)(void *, void *); // Platform-dependent implementation details for the worker. @@ -365,10 +38,14 @@ typedef struct VPxWorkerImpl VPxWorkerImpl; typedef struct { VPxWorkerImpl *impl_; VPxWorkerStatus status_; + // Thread name for the debugger. If not NULL, must point to a string that + // outlives the worker thread. For portability, use a name <= 15 characters + // long (not including the terminating NUL character). + const char *thread_name; VPxWorkerHook hook; // hook to call void *data1; // first argument passed to 'hook' void *data2; // second argument passed to 'hook' - int had_error; // return value of the last call to 'hook' + int had_error; // true if a call to 'hook' returned false } VPxWorker; // The interface for all thread-worker related functions. All these functions @@ -412,4 +89,4 @@ const VPxWorkerInterface *vpx_get_worker_interface(void); } // extern "C" #endif -#endif // VPX_THREAD_H_ +#endif // VPX_VPX_UTIL_VPX_THREAD_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_timestamp.h b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h new file mode 100644 index 0000000000..5296458fad --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_timestamp.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_ +#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Rational Number with an int64 numerator +typedef struct vpx_rational64 { + int64_t num; // fraction numerator + int den; // fraction denominator +} vpx_rational64_t; // alias for struct vpx_rational64_t + +static INLINE int gcd(int64_t a, int b) { + int r; // remainder + assert(a >= 0); + assert(b > 0); + while (b != 0) { + r = (int)(a % b); + a = b; + b = r; + } + + return (int)a; +} + +static INLINE void reduce_ratio(vpx_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_TIMESTAMP_H_ diff --git a/media/libvpx/libvpx/vpx_util/vpx_util.mk b/media/libvpx/libvpx/vpx_util/vpx_util.mk index c0ef8d3362..948e6d6f89 100644 --- a/media/libvpx/libvpx/vpx_util/vpx_util.mk +++ b/media/libvpx/libvpx/vpx_util/vpx_util.mk @@ -8,7 +8,14 @@ ## be found in the AUTHORS file in the root of the source tree. ## +UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk +UTIL_SRCS-yes += vpx_pthread.h UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c +UTIL_SRCS-yes += vpx_timestamp.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 0000000000..4ef57a2fee --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 0000000000..ce1102458e --- /dev/null +++ b/media/libvpx/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/media/libvpx/libvpx/vpx_version.h b/media/libvpx/libvpx/vpx_version.h new file mode 100644 index 0000000000..ba9b63a4a3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_version.h @@ -0,0 +1,11 @@ +// This file is generated. Do not edit. +#ifndef VPX_VERSION_H_ +#define VPX_VERSION_H_ +#define VERSION_MAJOR 1 +#define VERSION_MINOR 16 +#define VERSION_PATCH 0 +#define VERSION_EXTRA "" +#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) +#define VERSION_STRING_NOSP "v1.16.0" +#define VERSION_STRING " v1.16.0" +#endif // VPX_VERSION_H_ diff --git a/media/libvpx/libvpx/vpxdec.c b/media/libvpx/libvpx/vpxdec.c index 2cdb69d5a3..bfe6c1d6ba 100644 --- a/media/libvpx/libvpx/vpxdec.c +++ b/media/libvpx/libvpx/vpxdec.c @@ -47,6 +47,8 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); @@ -75,7 +77,7 @@ static const arg_def_t outputfile = static const arg_def_t threadsarg = ARG_DEF("t", "threads", 1, "Max threads to use"); static const arg_def_t frameparallelarg = - ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode"); + ARG_DEF(NULL, "frame-parallel", 0, "Frame parallel decode (ignored)"); static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0, "Show version string"); static const arg_def_t error_concealment = @@ -94,17 +96,43 @@ static const arg_def_t outbitdeptharg = #endif static const arg_def_t svcdecodingarg = ARG_DEF( NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); +static const arg_def_t framestatsarg = + ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9"); +static const arg_def_t lpfoptarg = + ARG_DEF(NULL, "lpf-opt", 1, + "Do loopfilter without waiting for all threads to sync."); -static const arg_def_t *all_args[] = { - &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, - &noblitarg, &progressarg, &limitarg, &skiparg, &postprocarg, - &summaryarg, &outputfile, &threadsarg, &frameparallelarg, &verbosearg, - &scalearg, &fb_arg, &md5arg, &error_concealment, &continuearg, +static const arg_def_t *all_args[] = { &help, + &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - &svcdecodingarg, NULL -}; + &svcdecodingarg, + &framestatsarg, + &rowmtarg, + &lpfoptarg, + NULL }; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -147,41 +175,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif - -void usage_exit(void) { +static void show_help(FILE *fout, int shorthelp) { int i; - fprintf(stderr, - "Usage: %s filename\n\n" - "Options:\n", - exec_name); - arg_show_usage(stderr, all_args); + fprintf(fout, "Usage: %s filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); #if CONFIG_VP8_DECODER - fprintf(stderr, "\nVP8 Postprocessing Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); #endif - fprintf(stderr, + fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); - fprintf(stderr, + fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%% - Frame number, zero padded to places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. If the -o option is " "not specified, the output will be\n directed to stdout.\n"); - fprintf(stderr, "\nIncluded decoders:\n\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", decoder->name, + fprintf(fout, " %-6s - %s\n", decoder->name, vpx_codec_iface_name(decoder->codec_interface())); } +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -225,13 +259,14 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, return 1; } *bytes_read = frame_size; + return 0; } - return 0; + return 1; } -static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, - size_t *bytes_in_buffer, size_t *buffer_size) { +static int dec_read_frame(struct VpxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { switch (input->vpx_input_ctx->file_type) { #if CONFIG_WEBM_IO case FILE_TYPE_WEBM: @@ -411,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; - default: die("Unrecognized pattern %%%c\n", p[1]); break; + default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); @@ -488,11 +523,13 @@ static int main_loop(int argc, const char **argv_) { size_t bytes_in_buffer = 0, buffer_size = 0; FILE *infile; int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; - int do_md5 = 0, progress = 0, frame_parallel = 0; + int do_md5 = 0, progress = 0; int stop_after = 0, postproc = 0, summary = 0, quiet = 1; int arg_skip = 0; int ec_enabled = 0; int keep_going = 0; + int enable_row_mt = 0; + int enable_lpf_opt = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; @@ -527,6 +564,8 @@ static int main_loop(int argc, const char **argv_) { char outfile_name[PATH_MAX] = { 0 }; FILE *outfile = NULL; + FILE *framestats_file = NULL; + MD5Context md5_ctx; unsigned char md5_digest[16]; @@ -542,12 +581,18 @@ static int main_loop(int argc, const char **argv_) { /* Parse command line */ exec_name = argv_[0]; argv = argv_dup(argc - 1, argv_ + 1); - + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { interface = get_vpx_decoder_by_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -584,8 +629,9 @@ static int main_loop(int argc, const char **argv_) { else if (arg_match(&arg, &threadsarg, argi)) cfg.threads = arg_parse_uint(&arg); #if CONFIG_VP9_DECODER - else if (arg_match(&arg, &frameparallelarg, argi)) - frame_parallel = 1; + else if (arg_match(&arg, &frameparallelarg, argi)) { + /* ignored for compatibility */ + } #endif else if (arg_match(&arg, &verbosearg, argi)) quiet = 0; @@ -603,6 +649,16 @@ static int main_loop(int argc, const char **argv_) { else if (arg_match(&arg, &svcdecodingarg, argi)) { svc_decoding = 1; svc_spatial_layer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &framestatsarg, argi)) { + framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } else if (arg_match(&arg, &rowmtarg, argi)) { + enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lpfoptarg, argi)) { + enable_lpf_opt = arg_parse_uint(&arg); } #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { @@ -637,6 +693,7 @@ static int main_loop(int argc, const char **argv_) { if (!fn) { free(argv); + fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Open file */ @@ -668,6 +725,7 @@ static int main_loop(int argc, const char **argv_) { #if !CONFIG_WEBM_IO fprintf(stderr, "vpxdec was built without WebM container support.\n"); #endif + free(argv); return EXIT_FAILURE; } @@ -712,8 +770,7 @@ static int main_loop(int argc, const char **argv_) { if (!interface) interface = get_vpx_decoder_by_index(0); dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) | - (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) | - (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0); + (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0); if (vpx_codec_dec_init(&decoder, interface->codec_interface(), &cfg, dec_flags)) { fprintf(stderr, "Failed to initialize decoder: %s\n", @@ -728,6 +785,18 @@ static int main_loop(int argc, const char **argv_) { goto fail; } } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) { + fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) { + fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -741,7 +810,7 @@ static int main_loop(int argc, const char **argv_) { if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { - if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + if (dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } @@ -749,6 +818,10 @@ static int main_loop(int argc, const char **argv_) { ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (!ext_fb_list.ext_fb) { + fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n"); + goto fail; + } if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer, release_vp9_frame_buffer, &ext_fb_list)) { @@ -761,6 +834,8 @@ static int main_loop(int argc, const char **argv_) { frame_avail = 1; got_data = 0; + if (framestats_file) fprintf(framestats_file, "bytes,qp\n"); + /* Decode file */ while (frame_avail || got_data) { vpx_codec_iter_t iter = NULL; @@ -770,7 +845,7 @@ static int main_loop(int argc, const char **argv_) { frame_avail = 0; if (!stop_after || frame_in < stop_after) { - if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + if (!dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { frame_avail = 1; frame_in++; @@ -786,6 +861,16 @@ static int main_loop(int argc, const char **argv_) { if (!keep_going) goto fail; } + if (framestats_file) { + int qp; + if (vpx_codec_control(&decoder, VPXD_GET_LAST_QUANTIZER, &qp)) { + warn("Failed VPXD_GET_LAST_QUANTIZER: %s", + vpx_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\n", (int)bytes_in_buffer, qp); + } + vpx_usec_timer_mark(&timer); dx_time += vpx_usec_timer_elapsed(&timer); } else { @@ -815,7 +900,7 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_mark(&timer); dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - if (!frame_parallel && !corrupted && + if (!corrupted && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); if (!keep_going) goto fail; @@ -852,6 +937,11 @@ static int main_loop(int argc, const char **argv_) { } scaled_img = vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16); + if (!scaled_img) { + fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n", + render_width, render_height); + goto fail; + } scaled_img->bit_depth = img->bit_depth; } @@ -888,6 +978,10 @@ static int main_loop(int argc, const char **argv_) { if (!img_shifted) { img_shifted = vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + if (!img_shifted) { + fprintf(stderr, "Failed to allocate image\n"); + goto fail; + } img_shifted->bit_depth = output_bit_depth; } if (output_bit_depth > img->bit_depth) { @@ -902,7 +996,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = { 0 }; + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -911,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) { if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( - buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, - &vpx_input_ctx.framerate, img->fmt, img->bit_depth); + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } // Y4M frame header - len = y4m_write_frame_header(buf, sizeof(buf)); + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } else { if (frame_out == 1) { @@ -952,7 +1047,7 @@ static int main_loop(int argc, const char **argv_) { if (do_md5) { update_image_md5(img, planes, &md5_ctx); } else { - write_image_file(img, planes, outfile); + if (!corrupted) write_image_file(img, planes, outfile); } } else { generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, @@ -1018,6 +1113,8 @@ fail2: free(ext_fb_list.ext_fb); fclose(infile); + if (framestats_file) fclose(framestats_file); + free(argv); return ret; @@ -1030,6 +1127,10 @@ int main(int argc, const char **argv_) { int error = 0; argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; diff --git a/media/libvpx/libvpx/vpxenc.c b/media/libvpx/libvpx/vpxenc.c index a0f760574c..bf5f1ad790 100644 --- a/media/libvpx/libvpx/vpxenc.c +++ b/media/libvpx/libvpx/vpxenc.c @@ -50,12 +50,6 @@ #endif #include "./y4minput.h" -/* Swallow warnings about unused results of fread/fwrite */ -static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { - return fread(ptr, size, nmemb, stream); -} -#define fread wrap_fread - static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { return fwrite(ptr, size, nmemb, stream); @@ -64,8 +58,8 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, static const char *exec_name; -static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, - const char *s, va_list ap) { +static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv( + vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) { if (ctx->err) { const char *detail = vpx_codec_error_detail(ctx); @@ -78,7 +72,9 @@ static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal, } } -static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(2, + 3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx, + const char *s, ...) { va_list ap; va_start(ap, s); @@ -86,8 +82,8 @@ static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) { va_end(ap); } -static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, - const char *s, ...) { +static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error( + vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) { va_list ap; va_start(ap, s); @@ -95,38 +91,14 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, va_end(ap); } -static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { - FILE *f = input_ctx->file; - y4m_input *y4m = &input_ctx->y4m; - int shortread = 0; - - if (input_ctx->file_type == FILE_TYPE_Y4M) { - if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; - } else { - shortread = read_yuv_frame(input_ctx, img); - } - - return !shortread; -} - -static int file_is_y4m(const char detect[4]) { - if (memcmp(detect, "YUV4", 4) == 0) { - return 1; - } - return 0; -} - -static int fourcc_is_ivf(const char detect[4]) { - if (memcmp(detect, "DKIF", 4) == 0) { - return 1; - } - return 0; -} - +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); static const arg_def_t outputfile = ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_nv12 = + ARG_DEF(NULL, "nv12", 0, "Input file is NV12 "); static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); static const arg_def_t use_i420 = @@ -144,10 +116,6 @@ static const arg_def_t pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); static const arg_def_t fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); -#if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = - ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); -#endif static const arg_def_t limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); static const arg_def_t skip = @@ -199,7 +167,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { &debugmode, +static const arg_def_t *main_args[] = { &help, + &debugmode, &outputfile, &codecarg, &passes, @@ -251,7 +220,8 @@ static const arg_def_t error_resilient = static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); -static const arg_def_t *global_args[] = { &use_yv12, +static const arg_def_t *global_args[] = { &use_nv12, + &use_yv12, &use_i420, &use_i422, &use_i444, @@ -315,14 +285,75 @@ static const arg_def_t *rc_args[] = { &buf_sz, &buf_initial_sz, &buf_optimal_sz, NULL }; +#if CONFIG_VP9_ENCODER +static const arg_def_t use_vizier_rc_params = + ARG_DEF(NULL, "use-vizier-rc-params", 1, "Use vizier rc params"); +static const arg_def_t active_wq_factor = + ARG_DEF(NULL, "active-wq-factor", 1, "Active worst quality factor"); +static const arg_def_t err_per_mb_factor = + ARG_DEF(NULL, "err-per-mb-factor", 1, "Error per macroblock factor"); +static const arg_def_t sr_default_decay_limit = ARG_DEF( + NULL, "sr-default-decay-limit", 1, "Second reference default decay limit"); +static const arg_def_t sr_diff_factor = + ARG_DEF(NULL, "sr-diff-factor", 1, "Second reference diff factor"); +static const arg_def_t kf_err_per_mb_factor = ARG_DEF( + NULL, "kf-err-per-mb-factor", 1, "Keyframe error per macroblock factor"); +static const arg_def_t kf_frame_min_boost_factor = + ARG_DEF(NULL, "kf-frame-min-boost-factor", 1, "Keyframe min boost"); +static const arg_def_t kf_frame_max_boost_first_factor = + ARG_DEF(NULL, "kf-frame-max-boost-first-factor", 1, + "Max keyframe boost adjustment factor for first frame"); +static const arg_def_t kf_frame_max_boost_subs_factor = + ARG_DEF(NULL, "kf-frame-max-boost-subs-factor", 1, + "Max boost adjustment factor for subsequent KFs"); +static const arg_def_t kf_max_total_boost_factor = ARG_DEF( + NULL, "kf-max-total-boost-factor", 1, "Keyframe max total boost factor"); +static const arg_def_t gf_max_total_boost_factor = + ARG_DEF(NULL, "gf-max-total-boost-factor", 1, + "Golden frame max total boost factor"); +static const arg_def_t gf_frame_max_boost_factor = + ARG_DEF(NULL, "gf-frame-max-boost-factor", 1, + "Golden frame max per frame boost factor"); +static const arg_def_t zm_factor = + ARG_DEF(NULL, "zm-factor", 1, "Zero motion power factor"); +static const arg_def_t rd_mult_inter_qp_fac = + ARG_DEF(NULL, "rd-mult-inter-qp-fac", 1, + "RD multiplier adjustment for inter frames"); +static const arg_def_t rd_mult_arf_qp_fac = + ARG_DEF(NULL, "rd-mult-arf-qp-fac", 1, + "RD multiplier adjustment for alt-ref frames"); +static const arg_def_t rd_mult_key_qp_fac = ARG_DEF( + NULL, "rd-mult-key-qp-fac", 1, "RD multiplier adjustment for key frames"); +static const arg_def_t *vizier_rc_args[] = { &use_vizier_rc_params, + &active_wq_factor, + &err_per_mb_factor, + &sr_default_decay_limit, + &sr_diff_factor, + &kf_err_per_mb_factor, + &kf_frame_min_boost_factor, + &kf_frame_max_boost_first_factor, + &kf_frame_max_boost_subs_factor, + &kf_max_total_boost_factor, + &gf_max_total_boost_factor, + &gf_frame_max_boost_factor, + &zm_factor, + &rd_mult_inter_qp_fac, + &rd_mult_arf_qp_fac, + &rd_mult_key_qp_fac, + NULL }; +#endif + static const arg_def_t bias_pct = ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, - &maxsection_pct, NULL }; + &maxsection_pct, + &corpus_complexity, NULL }; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -336,19 +367,19 @@ static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); static const arg_def_t sharpness = - ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); + ARG_DEF(NULL, "sharpness", 1, + "Increase sharpness at the expense of lower PSNR. (0..7)"); static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t auto_altref = - ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); -static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type"); -static const struct arg_enum_list tuning_enum[] = { - { "psnr", VP8_TUNE_PSNR }, { "ssim", VP8_TUNE_SSIM }, { NULL, 0 } -}; +static const arg_def_t arnr_type = + ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)"); +static const struct arg_enum_list tuning_enum[] = { { "psnr", VP8_TUNE_PSNR }, + { "ssim", VP8_TUNE_SSIM }, + { NULL, 0 } }; static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); static const arg_def_t cq_level = @@ -361,12 +392,14 @@ static const arg_def_t gf_cbr_boost_pct = ARG_DEF( #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)"); +static const arg_def_t auto_altref_vp8 = ARG_DEF( + NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames. (0..1)"); static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); static const arg_def_t *vp8_args[] = { &cpu_used_vp8, - &auto_altref, + &auto_altref_vp8, &noise_sens, &sharpness, &static_thresh, @@ -399,12 +432,22 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, #if CONFIG_VP9_ENCODER static const arg_def_t cpu_used_vp9 = - ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)"); + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-9..9)"); +static const arg_def_t auto_altref_vp9 = ARG_DEF( + NULL, "auto-alt-ref", 1, + "Enable automatic alt reference frames, 2+ enables multi-layer. (0..6)"); static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2 (set to 0 while threads > 1)"); + +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl", 1, "Enable temporal dependency model"); +static const arg_def_t enable_keyframe_filtering = + ARG_DEF(NULL, "enable-keyframe-filtering", 1, + "Enable key frame temporal filtering (0: off (default), 1: on)"); + static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); static const arg_def_t frame_parallel_decoding = ARG_DEF( @@ -441,8 +484,8 @@ static const struct arg_enum_list color_space_enum[] = { }; static const arg_def_t input_color_space = - ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:", - color_space_enum); + ARG_DEF_ENUM(NULL, "color-space", 1, + "The color space of input content:", color_space_enum); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -460,6 +503,7 @@ static const arg_def_t inbitdeptharg = static const struct arg_enum_list tune_content_enum[] = { { "default", VP9E_CONTENT_DEFAULT }, { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, { NULL, 0 } }; @@ -468,17 +512,39 @@ static const arg_def_t tune_content = ARG_DEF_ENUM( static const arg_def_t target_level = ARG_DEF( NULL, "target-level", 1, - "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;" - " 11: level 1.1; ... 62: level 6.2)"); + "Target level\n" + " 255: off (default)\n" + " 0: only keep level stats\n" + " 1: adaptively set alt-ref " + "distance and column tile limit based on picture size, and keep" + " level stats\n" + " 10: level 1.0 11: level 1.1 " + "... 62: level 6.2"); + +static const arg_def_t row_mt = + ARG_DEF(NULL, "row-mt", 1, + "Enable row based non-deterministic multi-threading in VP9"); + +static const arg_def_t disable_loopfilter = + ARG_DEF(NULL, "disable-loopfilter", 1, + "Control Loopfilter in VP9:\n" + " " + "0: Loopfilter on for all frames (default)\n" + " " + "1: Loopfilter off for non reference frames\n" + " " + "2: Loopfilter off for all frames"); #endif #if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, - &auto_altref, + &auto_altref_vp9, &sharpness, &static_thresh, &tile_cols, &tile_rows, + &enable_tpl_model, + &enable_keyframe_filtering, &arnr_maxframes, &arnr_strength, &arnr_type, @@ -498,6 +564,11 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &min_gf_interval, &max_gf_interval, &target_level, + &row_mt, + &disable_loopfilter, +// NOTE: The entries above have a corresponding entry in vp9_arg_ctrl_map. The +// entries below do not have a corresponding entry in vp9_arg_ctrl_map. They +// must be listed at the end of vp9_args. #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, @@ -509,6 +580,8 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_STATIC_THRESHOLD, VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, + VP9E_SET_TPL, + VP9E_SET_KEY_FRAME_FILTERING, VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, @@ -528,278 +601,66 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL, + VP9E_SET_ROW_MT, + VP9E_SET_DISABLE_LOOPFILTER, 0 }; #endif static const arg_def_t *no_args[] = { NULL }; -void usage_exit(void) { +static void show_help(FILE *fout, int shorthelp) { int i; const int num_encoder = get_vpx_encoder_count(); - fprintf(stderr, "Usage: %s -o dst_filename src_filename \n", + fprintf(fout, "Usage: %s -o dst_filename src_filename \n", exec_name); - fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stderr, main_args); - fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stderr, global_args); - fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stderr, rc_args); - fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stderr, rc_twopass_args); - fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stderr, kf_args); + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); + arg_show_usage(fout, kf_args); #if CONFIG_VP8_ENCODER - fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stderr, vp8_args); + fprintf(fout, "\nVP8 Specific Options:\n"); + arg_show_usage(fout, vp8_args); #endif #if CONFIG_VP9_ENCODER - fprintf(stderr, "\nVP9 Specific Options:\n"); - arg_show_usage(stderr, vp9_args); + fprintf(fout, "\nVP9 Specific Options:\n"); + arg_show_usage(fout, vp9_args); + fprintf(fout, "\nVizier Rate Control Options:\n"); + arg_show_usage(fout, vizier_rc_args); #endif - fprintf(stderr, + fprintf(fout, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); - fprintf(stderr, "\nIncluded encoders:\n\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; - fprintf(stderr, " %-6s - %s %s\n", encoder->name, + fprintf(fout, " %-6s - %s %s\n", encoder->name, vpx_codec_iface_name(encoder->codec_interface()), defstr); } - fprintf(stderr, "\n "); - fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } -#define mmin(a, b) ((a) < (b) ? (a) : (b)) - -#if CONFIG_VP9_HIGHBITDEPTH -static void find_mismatch_high(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - uint16_t *plane1, *plane2; - uint32_t stride1, stride2; - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; - stride1 = img1->stride[VPX_PLANE_Y] / 2; - stride2 = img2->stride[VPX_PLANE_Y] / 2; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(plane1 + (i + k) * stride1 + j + l); - yloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; - stride1 = img1->stride[VPX_PLANE_U] / 2; - stride2 = img2->stride[VPX_PLANE_U] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(plane1 + (i + k) * stride1 + j + l); - uloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; - stride1 = img1->stride[VPX_PLANE_V] / 2; - stride2 = img2->stride[VPX_PLANE_V] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(plane1 + (i + k) * stride1 + j + l); - vloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } -} -#endif - -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); -#if CONFIG_VP9_HIGHBITDEPTH - if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { - l_w *= 2; - c_w *= 2; - } -#endif - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - #define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) #if CONFIG_VP9_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map) @@ -819,9 +680,6 @@ struct stream_config { struct vpx_codec_enc_cfg cfg; const char *out_fn; const char *stats_fn; -#if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; -#endif stereo_format_t stereo_fmt; int arg_ctrls[ARG_CTRL_CNT_MAX][2]; int arg_ctrl_cnt; @@ -849,9 +707,6 @@ struct stream_state { uint64_t cx_time; size_t nbytes; stats_io_t stats; -#if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; -#endif struct vpx_image *img; vpx_codec_ctx_t decoder; int mismatch_seen; @@ -864,7 +719,7 @@ static void validate_positive_rational(const char *msg, rat->den *= -1; } - if (rat->num < 0) die("Error: %s must be positive\n", msg); + if (rat->num <= 0) die("Error: %s must be positive\n", msg); if (!rat->den) die("Error: %s has zero denominator\n", msg); } @@ -887,7 +742,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { global->codec = get_vpx_encoder_by_name(arg.val); if (!global->codec) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -913,6 +771,8 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { global->deadline = VPX_DL_REALTIME; else if (arg_match(&arg, &use_yv12, argi)) global->color_type = YV12; + else if (arg_match(&arg, &use_nv12, argi)) + global->color_type = NV12; else if (arg_match(&arg, &use_i420, argi)) global->color_type = I420; else if (arg_match(&arg, &use_i422, argi)) @@ -982,57 +842,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { } } -static void open_input_file(struct VpxInputContext *input) { - /* Parse certain options from the input file, if possible */ - input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") - : set_binary_mode(stdin); - - if (!input->file) fatal("Failed to open input file"); - - if (!fseeko(input->file, 0, SEEK_END)) { - /* Input file is seekable. Figure out how long it is, so we can get - * progress info. - */ - input->length = ftello(input->file); - rewind(input->file); - } - - /* Default to 1:1 pixel aspect ratio. */ - input->pixel_aspect_ratio.numerator = 1; - input->pixel_aspect_ratio.denominator = 1; - - /* For RAW input sources, these bytes will applied on the first frame - * in read_frame(). - */ - input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); - input->detect.position = 0; - - if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { - if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, - input->only_i420) >= 0) { - input->file_type = FILE_TYPE_Y4M; - input->width = input->y4m.pic_w; - input->height = input->y4m.pic_h; - input->pixel_aspect_ratio.numerator = input->y4m.par_n; - input->pixel_aspect_ratio.denominator = input->y4m.par_d; - input->framerate.numerator = input->y4m.fps_n; - input->framerate.denominator = input->y4m.fps_d; - input->fmt = input->y4m.vpx_fmt; - input->bit_depth = input->y4m.bit_depth; - } else - fatal("Unsupported Y4M stream."); - } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { - fatal("IVF is not supported as input."); - } else { - input->file_type = FILE_TYPE_RAW; - } -} - -static void close_input_file(struct VpxInputContext *input) { - fclose(input->file); - if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); -} - static struct stream_state *new_stream(struct VpxEncoderConfig *global, struct stream_state *prev) { struct stream_state *stream; @@ -1043,7 +852,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, } if (prev) { - memcpy(stream, prev, sizeof(*stream)); + *stream = *prev; stream->index++; prev->next = stream; } else { @@ -1079,7 +888,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, /* Default lag_in_frames is 0 in realtime mode CBR mode*/ if (global->deadline == VPX_DL_REALTIME && - stream->config.cfg.rc_end_usage == 1) + stream->config.cfg.rc_end_usage == VPX_CBR) stream->config.cfg.g_lag_in_frames = 0; } @@ -1094,8 +903,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, struct stream_state *stream, char **argv) { char **argi, **argj; struct arg arg; - static const arg_def_t **ctrl_args = no_args; - static const int *ctrl_args_map = NULL; + const arg_def_t **ctrl_args = no_args; + const int *ctrl_args_map = NULL; struct stream_config *config = &stream->config; int eos_mark_found = 0; #if CONFIG_VP9_HIGHBITDEPTH @@ -1134,10 +943,6 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->out_fn = arg.val; } else if (arg_match(&arg, &fpf_name, argi)) { config->stats_fn = arg.val; -#if CONFIG_FP_MB_STATS - } else if (arg_match(&arg, &fpmbf_name, argi)) { - config->fpmb_stats_fn = arg.val; -#endif } else if (arg_match(&arg, &use_webm, argi)) { #if CONFIG_WEBM_IO config->write_webm = 1; @@ -1221,6 +1026,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global, } else if (arg_match(&arg, &maxsection_pct, argi)) { config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &corpus_complexity, argi)) { + config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg); + if (global->passes < 2) warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &kf_min_dist, argi)) { @@ -1229,6 +1039,40 @@ static int parse_stream_params(struct VpxEncoderConfig *global, config->cfg.kf_max_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_disabled, argi)) { config->cfg.kf_mode = VPX_KF_DISABLED; +#if CONFIG_VP9_ENCODER + } else if (arg_match(&arg, &use_vizier_rc_params, argi)) { + config->cfg.use_vizier_rc_params = arg_parse_int(&arg); + } else if (arg_match(&arg, &active_wq_factor, argi)) { + config->cfg.active_wq_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &err_per_mb_factor, argi)) { + config->cfg.err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_default_decay_limit, argi)) { + config->cfg.sr_default_decay_limit = arg_parse_rational(&arg); + } else if (arg_match(&arg, &sr_diff_factor, argi)) { + config->cfg.sr_diff_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_err_per_mb_factor, argi)) { + config->cfg.kf_err_per_mb_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_min_boost_factor, argi)) { + config->cfg.kf_frame_min_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_first_factor, argi)) { + config->cfg.kf_frame_max_boost_first_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_frame_max_boost_subs_factor, argi)) { + config->cfg.kf_frame_max_boost_subs_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &kf_max_total_boost_factor, argi)) { + config->cfg.kf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_max_total_boost_factor, argi)) { + config->cfg.gf_max_total_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &gf_frame_max_boost_factor, argi)) { + config->cfg.gf_frame_max_boost_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &zm_factor, argi)) { + config->cfg.zm_factor = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_inter_qp_fac, argi)) { + config->cfg.rd_mult_inter_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_arf_qp_fac, argi)) { + config->cfg.rd_mult_arf_qp_fac = arg_parse_rational(&arg); + } else if (arg_match(&arg, &rd_mult_key_qp_fac, argi)) { + config->cfg.rd_mult_key_qp_fac = arg_parse_rational(&arg); +#endif #if CONFIG_VP9_HIGHBITDEPTH } else if (arg_match(&arg, &test16bitinternalarg, argi)) { if (strcmp(global->codec->name, "vp9") == 0) { @@ -1243,8 +1087,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, match = 1; /* Point either to the next free element or the first - * instance of this control. - */ + * instance of this control. + */ for (j = 0; j < config->arg_ctrl_cnt; j++) if (ctrl_args_map != NULL && config->arg_ctrls[j][0] == ctrl_args_map[i]) @@ -1321,17 +1165,6 @@ static void validate_stream_config(const struct stream_state *stream, fatal("Stream %d: duplicate stats file (from stream %d)", streami->index, stream->index); } - -#if CONFIG_FP_MB_STATS - /* Check for two streams sharing a mb stats file. */ - if (streami != stream) { - const char *a = stream->config.fpmb_stats_fn; - const char *b = streami->config.fpmb_stats_fn; - if (a && b && !strcmp(a, b)) - fatal("Stream %d: duplicate mb stats file (from stream %d)", - streami->index, stream->index); - } -#endif } } @@ -1419,9 +1252,14 @@ static void show_stream_config(struct stream_state *stream, SHOW(rc_2pass_vbr_bias_pct); SHOW(rc_2pass_vbr_minsection_pct); SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(rc_2pass_vbr_corpus_complexity); SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); + // Temporary use for debug + SHOW(use_vizier_rc_params); + SHOW(active_wq_factor.num); + SHOW(active_wq_factor.den); } static void open_output_file(struct stream_state *stream, @@ -1485,26 +1323,11 @@ static void setup_pass(struct stream_state *stream, fatal("Failed to open statistics store"); } -#if CONFIG_FP_MB_STATS - if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, - pass)) - fatal("Failed to open mb statistics store"); - } else { - if (!stats_open_mem(&stream->fpmb_stats, pass)) - fatal("Failed to open mb statistics store"); - } -#endif - stream->config.cfg.g_pass = global->passes == 2 ? pass ? VPX_RC_LAST_PASS : VPX_RC_FIRST_PASS : VPX_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); -#if CONFIG_FP_MB_STATS - stream->config.cfg.rc_firstpass_mb_stats_in = - stats_get(&stream->fpmb_stats); -#endif } stream->cx_time = 0; @@ -1578,14 +1401,14 @@ static void encode_frame(struct stream_state *stream, vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } I420Scale_16( - (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, - (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, - (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, - img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y], + (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_U], + (uint16_t *)stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_V], + (uint16_t *)stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; @@ -1657,7 +1480,7 @@ static void get_cx_data(struct stream_state *stream, *got_data = 0; while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) { static size_t fsize = 0; - static int64_t ivf_header_pos = 0; + static FileOffset ivf_header_pos = 0; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: @@ -1683,7 +1506,7 @@ static void get_cx_data(struct stream_state *stream, fsize += pkt->data.frame.sz; if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { - const int64_t currpos = ftello(stream->file); + const FileOffset currpos = ftello(stream->file); fseeko(stream->file, ivf_header_pos, SEEK_SET); ivf_write_frame_size(stream->file, fsize); fseeko(stream->file, currpos, SEEK_SET); @@ -1716,13 +1539,6 @@ static void get_cx_data(struct stream_state *stream, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; -#if CONFIG_FP_MB_STATS - case VPX_CODEC_FPMB_STATS_PKT: - stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, - pkt->data.firstpass_mb_stats.sz); - stream->nbytes += pkt->data.raw.sz; - break; -#endif case VPX_CODEC_PSNR_PKT: if (global->show_psnr) { @@ -1775,13 +1591,14 @@ static void test_decode(struct stream_state *stream, /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { struct vpx_ref_frame ref_enc, ref_dec; - int width, height; + unsigned int aligned_width = (stream->config.cfg.g_w + 15u) & ~15u; + unsigned int aligned_height = (stream->config.cfg.g_h + 15u) & ~15u; - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); enc_img = ref_enc.img; - vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); dec_img = ref_dec.img; ref_enc.frame_type = VP8_LAST_FRAME; @@ -1881,10 +1698,9 @@ int main(int argc, const char **argv_) { int res = 0; memset(&input, 0, sizeof(input)); + memset(&raw, 0, sizeof(raw)); exec_name = argv_[0]; - if (argc < 3) usage_exit(); - /* Setup default input stream settings */ input.framerate.numerator = 30; input.framerate.denominator = 1; @@ -1896,14 +1712,21 @@ int main(int argc, const char **argv_) { * codec. */ argv = argv_dup(argc - 1, argv_ + 1); + if (!argv) { + fprintf(stderr, "Error allocating argument list\n"); + return EXIT_FAILURE; + } parse_global_config(&global, argv); + if (argc < 3) usage_exit(); + switch (global.color_type) { case I420: input.fmt = VPX_IMG_FMT_I420; break; case I422: input.fmt = VPX_IMG_FMT_I422; break; case I444: input.fmt = VPX_IMG_FMT_I444; break; case I440: input.fmt = VPX_IMG_FMT_I440; break; case YV12: input.fmt = VPX_IMG_FMT_YV12; break; + case NV12: input.fmt = VPX_IMG_FMT_NV12; break; } { @@ -1931,7 +1754,10 @@ int main(int argc, const char **argv_) { /* Handle non-option arguments */ input.filename = argv[0]; - if (!input.filename) usage_exit(); + if (!input.filename) { + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0; @@ -2022,14 +1848,10 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(show_stream_config(stream, &global, &input)); if (pass == (global.pass ? global.pass - 1 : 0)) { - if (input.file_type == FILE_TYPE_Y4M) - /*The Y4M reader does its own allocation. - Just initialize this here to avoid problems if we never read any - frames.*/ - memset(&raw, 0, sizeof(raw)); - else + // The Y4M reader does its own allocation. + if (input.file_type != FILE_TYPE_Y4M) { vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32); - + } FOREACH_STREAM(stream->rate_hist = init_rate_histogram( &stream->config.cfg, &global.framerate)); } @@ -2153,10 +1975,9 @@ int main(int argc, const char **argv_) { } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; - const int64_t limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; - remaining = limit - input_pos + lagged_count; + remaining = input.length - input_pos + lagged_count; } average_rate = @@ -2176,9 +1997,9 @@ int main(int argc, const char **argv_) { if (!global.quiet) { FOREACH_STREAM(fprintf( - stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 - "b/f %7" PRId64 "b/s" - " %7" PRId64 " %s (%.2f fps)\033[K\n", + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, @@ -2215,10 +2036,6 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stats_close(&stream->stats, global.passes - 1)); -#if CONFIG_FP_MB_STATS - FOREACH_STREAM(stats_close(&stream->fpmb_stats, global.passes - 1)); -#endif - if (global.pass) break; } diff --git a/media/libvpx/libvpx/vpxenc.h b/media/libvpx/libvpx/vpxenc.h index d867e9d954..be54840f7d 100644 --- a/media/libvpx/libvpx/vpxenc.h +++ b/media/libvpx/libvpx/vpxenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXENC_H_ -#define VPXENC_H_ +#ifndef VPX_VPXENC_H_ +#define VPX_VPXENC_H_ #include "vpx/vpx_encoder.h" @@ -28,6 +28,7 @@ typedef enum { I444, // 4:4:4 8+ bit-depth I440, // 4:4:0 8+ bit-depth YV12, // 4:2:0 with uv flipped, only 8-bit depth + NV12, // 4:2:0 with uv interleaved } ColorInputType; struct VpxInterface; @@ -61,4 +62,4 @@ struct VpxEncoderConfig { } // extern "C" #endif -#endif // VPXENC_H_ +#endif // VPX_VPXENC_H_ diff --git a/media/libvpx/libvpx/vpxstats.c b/media/libvpx/libvpx/vpxstats.c index 142e367bb4..c0dd14e450 100644 --- a/media/libvpx/libvpx/vpxstats.c +++ b/media/libvpx/libvpx/vpxstats.c @@ -41,7 +41,7 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) { stats->buf.buf = malloc(stats->buf_alloc_sz); if (!stats->buf.buf) - fatal("Failed to allocate first-pass stats buffer (%lu bytes)", + fatal("Failed to allocate first-pass stats buffer (%u bytes)", (unsigned int)stats->buf_alloc_sz); nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file); diff --git a/media/libvpx/libvpx/vpxstats.h b/media/libvpx/libvpx/vpxstats.h index 5c9ea34f71..3625ee3291 100644 --- a/media/libvpx/libvpx/vpxstats.h +++ b/media/libvpx/libvpx/vpxstats.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXSTATS_H_ -#define VPXSTATS_H_ +#ifndef VPX_VPXSTATS_H_ +#define VPX_VPXSTATS_H_ #include @@ -40,4 +40,4 @@ vpx_fixed_buf_t stats_get(stats_io_t *stats); } // extern "C" #endif -#endif // VPXSTATS_H_ +#endif // VPX_VPXSTATS_H_ diff --git a/media/libvpx/libvpx/warnings.c b/media/libvpx/libvpx/warnings.c index a80da527f7..3e6e702536 100644 --- a/media/libvpx/libvpx/warnings.c +++ b/media/libvpx/libvpx/warnings.c @@ -98,7 +98,7 @@ void check_encoder_config(int disable_prompt, /* Count and print warnings. */ for (warning = warning_list.warning_node; warning != NULL; warning = warning->next_warning, ++num_warnings) { - warn(warning->warning_string); + warn("%s", warning->warning_string); } free_warning_list(&warning_list); diff --git a/media/libvpx/libvpx/warnings.h b/media/libvpx/libvpx/warnings.h index 6b8ae6796f..15558c6437 100644 --- a/media/libvpx/libvpx/warnings.h +++ b/media/libvpx/libvpx/warnings.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WARNINGS_H_ -#define WARNINGS_H_ +#ifndef VPX_WARNINGS_H_ +#define VPX_WARNINGS_H_ #ifdef __cplusplus extern "C" { @@ -30,4 +30,4 @@ void check_encoder_config(int disable_prompt, } // extern "C" #endif -#endif // WARNINGS_H_ +#endif // VPX_WARNINGS_H_ diff --git a/media/libvpx/libvpx/webmdec.cc b/media/libvpx/libvpx/webmdec.cc index ed4bd700dd..0576bb4978 100644 --- a/media/libvpx/libvpx/webmdec.cc +++ b/media/libvpx/libvpx/webmdec.cc @@ -19,25 +19,25 @@ namespace { void reset(struct WebmInputContext *const webm_ctx) { - if (webm_ctx->reader != NULL) { + if (webm_ctx->reader != nullptr) { mkvparser::MkvReader *const reader = reinterpret_cast(webm_ctx->reader); delete reader; } - if (webm_ctx->segment != NULL) { + if (webm_ctx->segment != nullptr) { mkvparser::Segment *const segment = reinterpret_cast(webm_ctx->segment); delete segment; } - if (webm_ctx->buffer != NULL) { + if (webm_ctx->buffer != nullptr) { delete[] webm_ctx->buffer; } - webm_ctx->reader = NULL; - webm_ctx->segment = NULL; - webm_ctx->buffer = NULL; - webm_ctx->cluster = NULL; - webm_ctx->block_entry = NULL; - webm_ctx->block = NULL; + webm_ctx->reader = nullptr; + webm_ctx->segment = nullptr; + webm_ctx->buffer = nullptr; + webm_ctx->cluster = nullptr; + webm_ctx->block_entry = nullptr; + webm_ctx->block = nullptr; webm_ctx->block_frame_index = 0; webm_ctx->video_track_index = 0; webm_ctx->timestamp_ns = 0; @@ -84,7 +84,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, } const mkvparser::Tracks *const tracks = segment->GetTracks(); - const mkvparser::VideoTrack *video_track = NULL; + const mkvparser::VideoTrack *video_track = nullptr; for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) { const mkvparser::Track *const track = tracks->GetTrackByIndex(i); if (track->GetType() == mkvparser::Track::kVideo) { @@ -94,7 +94,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx, } } - if (video_track == NULL || video_track->GetCodecId() == NULL) { + if (video_track == nullptr || video_track->GetCodecId() == nullptr) { rewind_and_reset(webm_ctx, vpx_ctx); return 0; } @@ -137,12 +137,12 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, do { long status = 0; bool get_new_block = false; - if (block_entry == NULL && !block_entry_eos) { + if (block_entry == nullptr && !block_entry_eos) { status = cluster->GetFirst(block_entry); get_new_block = true; } else if (block_entry_eos || block_entry->EOS()) { cluster = segment->GetNext(cluster); - if (cluster == NULL || cluster->EOS()) { + if (cluster == nullptr || cluster->EOS()) { *buffer_size = 0; webm_ctx->reached_eos = 1; return 1; @@ -150,25 +150,26 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, status = cluster->GetFirst(block_entry); block_entry_eos = false; get_new_block = true; - } else if (block == NULL || + } else if (block == nullptr || webm_ctx->block_frame_index == block->GetFrameCount() || block->GetTrackNumber() != webm_ctx->video_track_index) { status = cluster->GetNext(block_entry, block_entry); - if (block_entry == NULL || block_entry->EOS()) { + if (block_entry == nullptr || block_entry->EOS()) { block_entry_eos = true; continue; } get_new_block = true; } - if (status || block_entry == NULL) { + if (status || block_entry == nullptr) { return -1; } if (get_new_block) { block = block_entry->GetBlock(); + if (block == nullptr) return -1; webm_ctx->block_frame_index = 0; } - } while (block->GetTrackNumber() != webm_ctx->video_track_index || - block_entry_eos); + } while (block_entry_eos || + block->GetTrackNumber() != webm_ctx->video_track_index); webm_ctx->cluster = cluster; webm_ctx->block_entry = block_entry; @@ -178,9 +179,9 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, block->GetFrame(webm_ctx->block_frame_index); ++webm_ctx->block_frame_index; if (frame.len > static_cast(*buffer_size)) { - delete[] * buffer; + delete[] *buffer; *buffer = new uint8_t[frame.len]; - if (*buffer == NULL) { + if (*buffer == nullptr) { return -1; } webm_ctx->buffer = *buffer; @@ -197,7 +198,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, int webm_guess_framerate(struct WebmInputContext *webm_ctx, struct VpxInputContext *vpx_ctx) { uint32_t i = 0; - uint8_t *buffer = NULL; + uint8_t *buffer = nullptr; size_t buffer_size = 0; while (webm_ctx->timestamp_ns < 1000000000 && i < 50) { if (webm_read_frame(webm_ctx, &buffer, &buffer_size)) { @@ -209,10 +210,12 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx, vpx_ctx->framerate.denominator = static_cast(webm_ctx->timestamp_ns / 1000); delete[] buffer; + // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame(). + webm_ctx->buffer = nullptr; get_first_cluster(webm_ctx); - webm_ctx->block = NULL; - webm_ctx->block_entry = NULL; + webm_ctx->block = nullptr; + webm_ctx->block_entry = nullptr; webm_ctx->block_frame_index = 0; webm_ctx->timestamp_ns = 0; webm_ctx->reached_eos = 0; diff --git a/media/libvpx/libvpx/webmdec.h b/media/libvpx/libvpx/webmdec.h index 7dcb170caf..6ae7ee16d0 100644 --- a/media/libvpx/libvpx/webmdec.h +++ b/media/libvpx/libvpx/webmdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBMDEC_H_ -#define WEBMDEC_H_ +#ifndef VPX_WEBMDEC_H_ +#define VPX_WEBMDEC_H_ #include "./tools_common.h" @@ -27,7 +27,7 @@ struct WebmInputContext { const void *block; int block_frame_index; int video_track_index; - uint64_t timestamp_ns; + int64_t timestamp_ns; int is_key_frame; int reached_eos; }; @@ -66,4 +66,4 @@ void webm_free(struct WebmInputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMDEC_H_ +#endif // VPX_WEBMDEC_H_ diff --git a/media/libvpx/libvpx/webmenc.cc b/media/libvpx/libvpx/webmenc.cc index 66606674b0..c718ab5a9f 100644 --- a/media/libvpx/libvpx/webmenc.cc +++ b/media/libvpx/libvpx/webmenc.cc @@ -90,6 +90,6 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx) { segment->Finalize(); delete segment; delete writer; - webm_ctx->writer = NULL; - webm_ctx->segment = NULL; + webm_ctx->writer = nullptr; + webm_ctx->segment = nullptr; } diff --git a/media/libvpx/libvpx/webmenc.h b/media/libvpx/libvpx/webmenc.h index b4a9e357bb..4176e82081 100644 --- a/media/libvpx/libvpx/webmenc.h +++ b/media/libvpx/libvpx/webmenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBMENC_H_ -#define WEBMENC_H_ +#ifndef VPX_WEBMENC_H_ +#define VPX_WEBMENC_H_ #include #include @@ -52,4 +52,4 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMENC_H_ +#endif // VPX_WEBMENC_H_ diff --git a/media/libvpx/libvpx/y4menc.c b/media/libvpx/libvpx/y4menc.c index e26fcaf6ea..3940249cb9 100644 --- a/media/libvpx/libvpx/y4menc.c +++ b/media/libvpx/libvpx/y4menc.c @@ -9,6 +9,7 @@ */ #include +#include #include "./y4menc.h" int y4m_write_file_header(char *buf, size_t len, int width, int height, @@ -17,41 +18,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, const char *color; switch (bit_depth) { case 8: - color = fmt == VPX_IMG_FMT_444A - ? "C444alpha\n" - : fmt == VPX_IMG_FMT_I444 ? "C444\n" : fmt == VPX_IMG_FMT_I422 - ? "C422\n" - : "C420jpeg\n"; + color = fmt == VPX_IMG_FMT_I444 ? "C444\n" + : fmt == VPX_IMG_FMT_I422 ? "C422\n" + : "C420jpeg\n"; break; case 9: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p9 XYSCSS=444P9\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" - : "C420p9 XYSCSS=420P9\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n" + : "C420p9 XYSCSS=420P9\n"; break; case 10: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p10 XYSCSS=444P10\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" - : "C420p10 XYSCSS=420P10\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n" + : "C420p10 XYSCSS=420P10\n"; break; case 12: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p12 XYSCSS=444P12\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" - : "C420p12 XYSCSS=420P12\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n" + : "C420p12 XYSCSS=420P12\n"; break; case 14: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p14 XYSCSS=444P14\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" - : "C420p14 XYSCSS=420P14\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n" + : "C420p14 XYSCSS=420P14\n"; break; case 16: - color = fmt == VPX_IMG_FMT_I44416 - ? "C444p16 XYSCSS=444P16\n" - : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" - : "C420p16 XYSCSS=420P16\n"; + color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n" + : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n" + : "C420p16 XYSCSS=420P16\n"; break; default: color = NULL; assert(0); } diff --git a/media/libvpx/libvpx/y4menc.h b/media/libvpx/libvpx/y4menc.h index 69d590413e..9a367e34c6 100644 --- a/media/libvpx/libvpx/y4menc.h +++ b/media/libvpx/libvpx/y4menc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef Y4MENC_H_ -#define Y4MENC_H_ +#ifndef VPX_Y4MENC_H_ +#define VPX_Y4MENC_H_ #include "./tools_common.h" @@ -30,4 +30,4 @@ int y4m_write_frame_header(char *buf, size_t len); } // extern "C" #endif -#endif // Y4MENC_H_ +#endif // VPX_Y4MENC_H_ diff --git a/media/libvpx/libvpx/y4minput.c b/media/libvpx/libvpx/y4minput.c index acf7d69fe9..40f152d057 100644 --- a/media/libvpx/libvpx/y4minput.c +++ b/media/libvpx/libvpx/y4minput.c @@ -10,7 +10,9 @@ * Based on code from the OggTheora software codec source code, * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. */ +#include #include +#include #include #include @@ -20,12 +22,13 @@ // Reads 'size' bytes from 'file' into 'buf' with some fault tolerance. // Returns true on success. static int file_read(void *buf, size_t size, FILE *file) { - const int kMaxRetries = 5; - int retry_count = 0; - int file_error; + const int kMaxTries = 5; + int try_count = 0; + int file_error = 0; size_t len = 0; - do { + while (!feof(file) && len < size && try_count < kMaxTries) { const size_t n = fread((uint8_t *)buf + len, 1, size - len, file); + ++try_count; len += n; file_error = ferror(file); if (file_error) { @@ -38,28 +41,21 @@ static int file_read(void *buf, size_t size, FILE *file) { return 0; } } - } while (!feof(file) && len < size && ++retry_count < kMaxRetries); + } if (!feof(file) && len != size) { fprintf(stderr, "Error reading file: %u of %u bytes read," - " error: %d, retries: %d, %d: %s\n", - (uint32_t)len, (uint32_t)size, file_error, retry_count, errno, + " error: %d, tries: %d, %d: %s\n", + (uint32_t)len, (uint32_t)size, file_error, try_count, errno, strerror(errno)); } return len == size; } static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { - int got_w; - int got_h; - int got_fps; - int got_interlace; - int got_par; - int got_chroma; char *p; char *q; - got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0; for (p = _tags;; p = q) { /*Skip any leading spaces.*/ while (*p == ' ') p++; @@ -72,52 +68,119 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { switch (p[0]) { case 'W': { if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1; - got_w = 1; break; } case 'H': { if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1; - got_h = 1; break; } case 'F': { if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) { return -1; } - got_fps = 1; break; } case 'I': { _y4m->interlace = p[1]; - got_interlace = 1; break; } case 'A': { if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) { return -1; } - got_par = 1; break; } case 'C': { if (q - p > 16) return -1; memcpy(_y4m->chroma_type, p + 1, q - p - 1); _y4m->chroma_type[q - p - 1] = '\0'; - got_chroma = 1; break; } /*Ignore unknown tags.*/ } } - if (!got_w || !got_h || !got_fps) return -1; - if (!got_interlace) _y4m->interlace = '?'; - if (!got_par) _y4m->par_n = _y4m->par_d = 0; - /*Chroma-type is not specified in older files, e.g., those generated by - mplayer.*/ - if (!got_chroma) strcpy(_y4m->chroma_type, "420"); return 0; } +// Copy a single tag into the buffer, along with a null character. +// Returns 0 if any file IO errors occur. +static int copy_tag(char *buf, size_t buf_len, char *end_tag, FILE *file) { + size_t i; + assert(buf_len >= 1); + // Skip leading space characters. + do { + if (!file_read(buf, 1, file)) { + return 0; + } + } while (buf[0] == ' '); + + // If we hit the newline, treat this as the "empty" tag. + if (buf[0] == '\n') { + buf[0] = '\0'; + *end_tag = '\n'; + return 1; + } + + // Copy over characters until a space is hit, or the buffer is exhausted. + for (i = 1; i < buf_len; ++i) { + if (!file_read(buf + i, 1, file)) { + return 0; + } + if (buf[i] == ' ' || buf[i] == '\n') { + break; + } + } + if (i == buf_len) { + fprintf(stderr, "Error: Y4M header tags must be less than %lu characters\n", + (unsigned long)i); + return 0; + } + *end_tag = buf[i]; + buf[i] = '\0'; + return 1; +} + +/* Returns 1 if tags were parsed successfully, 0 otherwise. */ +static int parse_tags(y4m_input *y4m_ctx, FILE *file) { + char tag[256]; + char end; /* Character denoting the end of the tag, ' ' or '\n'. */ + /* Set Y4M tags to defaults, updating them as processing occurs. Mandatory + fields are marked with -1 and will be checked after the tags are parsed. */ + y4m_ctx->pic_w = -1; + y4m_ctx->pic_h = -1; + y4m_ctx->fps_n = -1; /* Also serves as marker for fps_d */ + y4m_ctx->par_n = 0; + y4m_ctx->par_d = 0; + y4m_ctx->interlace = '?'; + snprintf(y4m_ctx->chroma_type, sizeof(y4m_ctx->chroma_type), "420"); + + /* Find one tag at a time. */ + do { + if (!copy_tag(tag, sizeof(tag), &end, file)) { + return 0; + } + /* y4m_parse_tags returns 0 on success. */ + if (y4m_parse_tags(y4m_ctx, tag)) { + return 0; + } + } while (end != '\n'); + + /* Check the mandatory fields. */ + if (y4m_ctx->pic_w == -1) { + fprintf(stderr, "Width field missing\n"); + return 0; + } + if (y4m_ctx->pic_h == -1) { + fprintf(stderr, "Height field missing\n"); + return 0; + } + if (y4m_ctx->fps_n == -1) { + fprintf(stderr, "FPS field missing\n"); + return 0; + } + return 1; +} + /*All anti-aliasing filters in the following conversion functions are based on one of two window functions: The 6-tap Lanczos window (for down-sampling and shifts): @@ -130,8 +193,8 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { The number of taps is intentionally kept small to reduce computational overhead and limit ringing. - The taps from these filters are scaled so that their sum is 1, and the result - is scaled by 128 and rounded to integers to create a filter whose + The taps from these filters are scaled so that their sum is 1, and the + result is scaled by 128 and rounded to integers to create a filter whose intermediate values fit inside 16 bits. Coefficients are rounded in such a way as to ensure their sum is still 128, which is usually equivalent to normal rounding. @@ -139,7 +202,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ - #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) @@ -195,26 +257,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> - 7, + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, 255); } for (; x < _c_w - 3; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, 255); } for (; x < _c_w; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, 255); } _dst += _c_w; @@ -222,26 +287,6 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, } } -/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/ -static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, - unsigned char *_aux) { - int c_w; - int c_h; - int c_sz; - int pli; - /*Skip past the luma data.*/ - _dst += _y4m->pic_w * _y4m->pic_h; - /*Compute the size of each chroma plane.*/ - c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; - c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; - c_sz = c_w * c_h; - for (pli = 1; pli < 3; pli++) { - y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h); - _dst += c_sz; - _aux += c_sz; - } -} - /*This format is only used for interlaced content, but is included for completeness. @@ -314,28 +359,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + - 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, 255); } for (; y < c_h - 2; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[(c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, 255); } _dst++; @@ -361,10 +409,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, } for (; y < c_h - 3; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + - 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> - 7, + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { @@ -404,18 +453,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + - 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> - 7, + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, 255); } for (; y < _c_h - 3; y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + - 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> - 7, + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, 255); } for (; y < _c_h; y += 2) { @@ -642,33 +693,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - - _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); } for (; x < c_w - 2; x++) { tmp[x << 1] = - (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[x + 1] - _aux[x + 2] + 64) >> - 7, + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - - 5 * _aux[x + 2] + 64) >> - 7, + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, 255); } for (; x < c_w; x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> - 7, + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, 255); if ((x << 1 | 1) < dst_c_w) { tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( @@ -718,27 +774,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { - tmp[x >> 1] = - OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - - 17 * _aux[OC_MINI(2, c_w - 1)] + - 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> - 7, - 255); + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 3; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - - 17 * (_aux[x - 1] + _aux[x + 2]) + - 78 * (_aux[x] + _aux[x + 1]) + 64) >> - 7, + tmp[x >> 1] = OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, 255); } for (; x < c_w; x += 2) { - tmp[x >> 1] = OC_CLAMPI( - 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + - 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> - 7, - 255); + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); } tmp += dst_c_w; _aux += c_w; @@ -769,299 +827,277 @@ static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, (void)_aux; } -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420) { - char buffer[80] = { 0 }; - int ret; - int i; - /*Read until newline, or 80 cols, whichever happens first.*/ - for (i = 0; i < 79; i++) { - if (_nskip > 0) { - buffer[i] = *_skip++; - _nskip--; - } else { - if (!file_read(buffer + i, 1, _fin)) return -1; - } - if (buffer[i] == '\n') break; +static const char TAG[] = "YUV4MPEG2"; + +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420) { + // File must start with |TAG|. + char tag_buffer[9]; // 9 == strlen(TAG) + // Read as much as possible from |skip_buffer|, which were characters + // that were previously read from the file to do input-type detection. + assert(num_skip >= 0 && num_skip <= 8); + if (num_skip > 0) { + memcpy(tag_buffer, skip_buffer, num_skip); } - /*We skipped too much header data.*/ - if (_nskip > 0) return -1; - if (i == 79) { - fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); + // Start reading from the file now that the |skip_buffer| is depleted. + if (!file_read(tag_buffer + num_skip, 9 - num_skip, file)) { return -1; } - buffer[i] = '\0'; - if (memcmp(buffer, "YUV4MPEG", 8)) { - fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n"); + if (memcmp(TAG, tag_buffer, 9) != 0) { + fprintf(stderr, "Error parsing header: must start with %s\n", TAG); return -1; } - if (buffer[8] != '2') { - fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n"); + // Next character must be a space. + if (!file_read(tag_buffer, 1, file) || tag_buffer[0] != ' ') { + fprintf(stderr, "Error parsing header: space must follow %s\n", TAG); + return -1; } - ret = y4m_parse_tags(_y4m, buffer + 5); - if (ret < 0) { - fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); - return ret; + if (!parse_tags(y4m_ctx, file)) { + fprintf(stderr, "Error parsing %s header.\n", TAG); } - if (_y4m->interlace == '?') { + if (y4m_ctx->interlace == '?') { fprintf(stderr, "Warning: Input video interlacing format unknown; " "assuming progressive scan.\n"); - } else if (_y4m->interlace != 'p') { + } else if (y4m_ctx->interlace != 'p') { fprintf(stderr, "Input video is interlaced; " "Only progressive scan handled.\n"); return -1; } - _y4m->vpx_fmt = VPX_IMG_FMT_I420; - _y4m->bps = 12; - _y4m->bit_depth = 8; - if (strcmp(_y4m->chroma_type, "420") == 0 || - strcmp(_y4m->chroma_type, "420jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I420; + y4m_ctx->bps = 12; + y4m_ctx->bit_depth = 8; + y4m_ctx->aux_buf = NULL; + y4m_ctx->dst_buf = NULL; + if (strcmp(y4m_ctx->chroma_type, "420") == 0 || + strcmp(y4m_ctx->chroma_type, "420jpeg") == 0 || + strcmp(y4m_ctx->chroma_type, "420mpeg2") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - } else if (strcmp(_y4m->chroma_type, "420p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "420p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 10; - _y4m->bps = 15; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 10; + y4m_ctx->bps = 15; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = - 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + } else if (strcmp(y4m_ctx->chroma_type, "420p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2)); /* Natively supported: no conversion required. */ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - _y4m->bit_depth = 12; - _y4m->bps = 18; - _y4m->vpx_fmt = VPX_IMG_FMT_I42016; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; + y4m_ctx->bit_depth = 12; + y4m_ctx->bps = 18; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42016; if (only_420) { fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; - /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "420paldv") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_v = + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); - _y4m->convert = y4m_convert_42xpaldv_42xjpeg; - } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { - _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->aux_buf_sz = + 3 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * ((y4m_ctx->pic_h + 1) / 2); + y4m_ctx->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422jpeg") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422jpeg_420jpeg; - } else if (strcmp(_y4m->chroma_type, "422") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(y4m_ctx->chroma_type, "422") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_422_420jpeg; + y4m_ctx->aux_buf_read_sz = + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_422_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I422; - _y4m->bps = 16; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = - _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I422; + y4m_ctx->bps = 16; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "422p10") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 20; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p10") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 20; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "422p12") == 0) { - _y4m->src_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I42216; - _y4m->bps = 24; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "422p12") == 0) { + y4m_ctx->src_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I42216; + y4m_ctx->bps = 24; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = + 2 * (y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h); + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "411") == 0) { - _y4m->src_c_dec_h = 4; - _y4m->dst_c_dec_h = 2; - _y4m->src_c_dec_v = 1; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "411") == 0) { + y4m_ctx->src_c_dec_h = 4; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_411_420jpeg; - } else if (strcmp(_y4m->chroma_type, "444") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; + y4m_ctx->aux_buf_read_sz = 2 * ((y4m_ctx->pic_w + 3) / 4) * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = + y4m_ctx->aux_buf_read_sz + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_411_420jpeg; + fprintf(stderr, "Unsupported conversion from yuv 411\n"); + return -1; + } else if (strcmp(y4m_ctx->chroma_type, "444") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + y4m_ctx->dst_c_dec_h = 2; + y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*Chroma filter required: read into the aux buf first. We need to make two filter passes, so we need some extra space in the aux buffer.*/ - _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = - _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; - _y4m->convert = y4m_convert_444_420jpeg; + y4m_ctx->aux_buf_read_sz = 2 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz + + ((y4m_ctx->pic_w + 1) / 2) * y4m_ctx->pic_h; + y4m_ctx->convert = y4m_convert_444_420jpeg; } else { - _y4m->vpx_fmt = VPX_IMG_FMT_I444; - _y4m->bps = 24; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I444; + y4m_ctx->bps = 24; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; } - } else if (strcmp(_y4m->chroma_type, "444p10") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 30; - _y4m->bit_depth = 10; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p10") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 30; + y4m_ctx->bit_depth = 10; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "444p12") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - _y4m->vpx_fmt = VPX_IMG_FMT_I44416; - _y4m->bps = 36; - _y4m->bit_depth = 12; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; + } else if (strcmp(y4m_ctx->chroma_type, "444p12") == 0) { + y4m_ctx->src_c_dec_h = 1; + y4m_ctx->src_c_dec_v = 1; + y4m_ctx->vpx_fmt = VPX_IMG_FMT_I44416; + y4m_ctx->bps = 36; + y4m_ctx->bit_depth = 12; + y4m_ctx->dst_c_dec_h = y4m_ctx->src_c_dec_h; + y4m_ctx->dst_c_dec_v = y4m_ctx->src_c_dec_v; + y4m_ctx->dst_buf_read_sz = 2 * 3 * y4m_ctx->pic_w * y4m_ctx->pic_h; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_null; if (only_420) { fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); return -1; } - } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) { - _y4m->src_c_dec_h = 1; - _y4m->src_c_dec_v = 1; - if (only_420) { - _y4m->dst_c_dec_h = 2; - _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; - /*Chroma filter required: read into the aux buf first. - We need to make two filter passes, so we need some extra space in the - aux buffer. - The extra plane also gets read into the aux buf. - It will be discarded.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; - _y4m->convert = y4m_convert_444_420jpeg; - } else { - _y4m->vpx_fmt = VPX_IMG_FMT_444A; - _y4m->bps = 32; - _y4m->dst_c_dec_h = _y4m->src_c_dec_h; - _y4m->dst_c_dec_v = _y4m->src_c_dec_v; - _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h; - /*Natively supported: no conversion required.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_null; - } - } else if (strcmp(_y4m->chroma_type, "mono") == 0) { - _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0; - _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2; - _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + } else if (strcmp(y4m_ctx->chroma_type, "mono") == 0) { + y4m_ctx->src_c_dec_h = y4m_ctx->src_c_dec_v = 0; + y4m_ctx->dst_c_dec_h = y4m_ctx->dst_c_dec_v = 2; + y4m_ctx->dst_buf_read_sz = y4m_ctx->pic_w * y4m_ctx->pic_h; /*No extra space required, but we need to clear the chroma planes.*/ - _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; - _y4m->convert = y4m_convert_mono_420jpeg; + y4m_ctx->aux_buf_sz = y4m_ctx->aux_buf_read_sz = 0; + y4m_ctx->convert = y4m_convert_mono_420jpeg; } else { - fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type); + fprintf(stderr, "Unknown chroma sampling type: %s\n", y4m_ctx->chroma_type); return -1; } /*The size of the final frame buffers is always computed from the destination chroma decimation type.*/ - _y4m->dst_buf_sz = - _y4m->pic_w * _y4m->pic_h + - 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * - ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); - if (_y4m->bit_depth == 8) - _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); + y4m_ctx->dst_buf_sz = + y4m_ctx->pic_w * y4m_ctx->pic_h + + 2 * ((y4m_ctx->pic_w + y4m_ctx->dst_c_dec_h - 1) / y4m_ctx->dst_c_dec_h) * + ((y4m_ctx->pic_h + y4m_ctx->dst_c_dec_v - 1) / y4m_ctx->dst_c_dec_v); + if (y4m_ctx->bit_depth == 8) + y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz); else - _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); + y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz); + if (!y4m_ctx->dst_buf) return -1; - if (_y4m->aux_buf_sz > 0) - _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + if (y4m_ctx->aux_buf_sz > 0) { + y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz); + if (!y4m_ctx->aux_buf) { + free(y4m_ctx->dst_buf); + return -1; + } + } return 0; } @@ -1113,6 +1149,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { _img->fmt = _y4m->vpx_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps; diff --git a/media/libvpx/libvpx/y4minput.h b/media/libvpx/libvpx/y4minput.h index 9e69ceb835..573750d749 100644 --- a/media/libvpx/libvpx/y4minput.h +++ b/media/libvpx/libvpx/y4minput.h @@ -11,8 +11,8 @@ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. */ -#ifndef Y4MINPUT_H_ -#define Y4MINPUT_H_ +#ifndef VPX_Y4MINPUT_H_ +#define VPX_Y4MINPUT_H_ #include #include "vpx/vpx_image.h" @@ -56,8 +56,16 @@ struct y4m_input { unsigned int bit_depth; }; -int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, - int only_420); +/** + * Open the input file, treating it as Y4M. |y4m_ctx| is filled in after + * reading it. The |skip_buffer| indicates bytes that were previously read + * from |file|, to do input-type detection; this buffer will be read before + * the |file| is read. It is of size |num_skip|, which *must* be 8 or less. + * + * Returns 0 on success, -1 on failure. + */ +int y4m_input_open(y4m_input *y4m_ctx, FILE *file, char *skip_buffer, + int num_skip, int only_420); void y4m_input_close(y4m_input *_y4m); int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); @@ -65,4 +73,4 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); } // extern "C" #endif -#endif // Y4MINPUT_H_ +#endif // VPX_Y4MINPUT_H_ diff --git a/media/libvpx/moz.build b/media/libvpx/moz.build index 529d93741a..11d53f74c3 100644 --- a/media/libvpx/moz.build +++ b/media/libvpx/moz.build @@ -66,8 +66,37 @@ elif CONFIG['CPU_ARCH'] == 'arm': ASFLAGS += [ '-no-integrated-as', ] +elif CONFIG['CPU_ARCH'] == 'aarch64': + EXPORTS.vpx += files['AARCH64_EXPORTS'] + SOURCES += files['AARCH64_SOURCES'] + if CONFIG['OS_TARGET'] == 'Darwin': + ASFLAGS += [ '-I%s/media/libvpx/config/mac/arm64/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/mac/arm64/' % TOPSRCDIR ] + else: # Linux, BSDs, etc. + ASFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/linux/arm64/' % TOPSRCDIR ] +elif CONFIG['CPU_ARCH'] == 'mips32': + EXPORTS.vpx += files['MIPS32_EXPORTS'] + SOURCES += files['MIPS32_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/linux/mips32/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/linux/mips32/' % TOPSRCDIR ] +elif CONFIG['CPU_ARCH'] == 'mips64': + EXPORTS.vpx += files['MIPS64_EXPORTS'] + SOURCES += files['MIPS64_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/linux/mips64/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/linux/mips64/' % TOPSRCDIR ] +elif CONFIG['CPU_ARCH'].startswith('ppc'): + EXPORTS.vpx += files['PPC64LE_EXPORTS'] + SOURCES += files['PPC64LE_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/linux/ppc64le/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/linux/ppc64le/' % TOPSRCDIR ] +elif CONFIG['CPU_ARCH'].startswith('loongarch64'): + EXPORTS.vpx += files['LOONGARCH64_EXPORTS'] + SOURCES += files['LOONGARCH64_SOURCES'] + ASFLAGS += [ '-I%s/media/libvpx/config/linux/loongarch64/' % TOPSRCDIR ] + CFLAGS += [ '-I%s/media/libvpx/config/linux/loongarch64/' % TOPSRCDIR ] else: - # Generic C-only configuration + # Generic C-only configuration used by unsupported targets. EXPORTS.vpx += files['GENERIC_EXPORTS'] SOURCES += files['GENERIC_SOURCES'] ASFLAGS += [ '-I%s/media/libvpx/config/generic/' % TOPSRCDIR ] @@ -106,6 +135,30 @@ if CONFIG['CLANG_CL'] or not CONFIG['_MSC_VER']: SOURCES[f].flags += ['-mavx'] if 'avx2.c' in f: SOURCES[f].flags += ['-mavx2'] + if 'avx512.c' in f: + SOURCES[f].flags += [ + '-mavx512f', + '-mavx512cd', + '-mavx512bw', + '-mavx512dq', + '-mavx512vl', + ] + if 'neon_dotprod.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod'] + if 'neon_i8mm.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm'] + if '_sve.c' in f: + SOURCES[f].flags += ['-march=armv8.2-a+dotprod+i8mm+sve'] + if '_sve2.c' in f: + SOURCES[f].flags += ['-march=armv9-a+i8mm+sve2'] + if '_vsx.c' in f: + SOURCES[f].flags += ['-maltivec', '-mvsx'] + if '_msa.c' in f: + SOURCES[f].flags += ['-mmsa'] + if '_lsx.c' in f: + SOURCES[f].flags += ['-mlsx'] + if '_lasx.c' in f: + SOURCES[f].flags += ['-mlasx'] # Suppress warnings in third-party code. if CONFIG['GNU_CC'] or CONFIG['CLANG_CL']: diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild index 2a7f7f6e23..71b3f91e85 100644 --- a/media/libvpx/sources.mozbuild +++ b/media/libvpx/sources.mozbuild @@ -8,13 +8,17 @@ files = { 'libvpx/vpx/vpx_codec.h', 'libvpx/vpx/vpx_decoder.h', 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', 'libvpx/vpx/vpx_frame_buffer.h', 'libvpx/vpx/vpx_image.h', 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', 'libvpx/vpx_mem/vpx_mem.h', 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', 'libvpx/vpx_ports/system_state.h', 'libvpx/vpx_ports/vpx_timer.h', 'libvpx/vpx_ports/x86.h', @@ -24,7 +28,6 @@ files = { 'X64_SOURCES': [ 'libvpx/vp8/common/alloccommon.c', 'libvpx/vp8/common/blockd.c', - 'libvpx/vp8/common/copy_c.c', 'libvpx/vp8/common/dequantize.c', 'libvpx/vp8/common/entropy.c', 'libvpx/vp8/common/entropymode.c', @@ -49,10 +52,9 @@ files = { 'libvpx/vp8/common/swapyv12buffer.c', 'libvpx/vp8/common/treecoder.c', 'libvpx/vp8/common/vp8_loopfilter.c', - 'libvpx/vp8/common/x86/copy_sse2.asm', - 'libvpx/vp8/common/x86/copy_sse3.asm', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/common/x86/bilinear_filter_sse2.c', 'libvpx/vp8/common/x86/dequantize_mmx.asm', - 'libvpx/vp8/common/x86/filter_x86.c', 'libvpx/vp8/common/x86/idct_blk_mmx.c', 'libvpx/vp8/common/x86/idct_blk_sse2.c', 'libvpx/vp8/common/x86/idctllm_mmx.asm', @@ -76,6 +78,7 @@ files = { 'libvpx/vp8/decoder/threading.c', 'libvpx/vp8/encoder/bitstream.c', 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', 'libvpx/vp8/encoder/dct.c', 'libvpx/vp8/encoder/denoising.c', 'libvpx/vp8/encoder/encodeframe.c', @@ -98,17 +101,17 @@ files = { 'libvpx/vp8/encoder/tokenize.c', 'libvpx/vp8/encoder/treewriter.c', 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/encoder/x86/block_error_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse3.asm', 'libvpx/vp8/encoder/x86/dct_sse2.asm', 'libvpx/vp8/encoder/x86/denoising_sse2.c', - 'libvpx/vp8/encoder/x86/encodeopt.asm', 'libvpx/vp8/encoder/x86/fwalsh_sse2.asm', - 'libvpx/vp8/encoder/x86/quantize_mmx.asm', 'libvpx/vp8/encoder/x86/quantize_sse4.c', - 'libvpx/vp8/encoder/x86/quantize_ssse3.c', 'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm', - 'libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c', 'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c', 'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c', 'libvpx/vp8/vp8_cx_iface.c', 'libvpx/vp8/vp8_dx_iface.c', 'libvpx/vp9/common/vp9_alloccommon.c', @@ -141,7 +144,7 @@ files = { 'libvpx/vp9/decoder/vp9_decoder.c', 'libvpx/vp9/decoder/vp9_detokenize.c', 'libvpx/vp9/decoder/vp9_dsubexp.c', - 'libvpx/vp9/decoder/vp9_dthread.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', 'libvpx/vp9/encoder/vp9_aq_360.c', 'libvpx/vp9/encoder/vp9_aq_complexity.c', @@ -156,11 +159,14 @@ files = { 'libvpx/vp9/encoder/vp9_encodemv.c', 'libvpx/vp9/encoder/vp9_encoder.c', 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', 'libvpx/vp9/encoder/vp9_extend.c', 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', 'libvpx/vp9/encoder/vp9_lookahead.c', 'libvpx/vp9/encoder/vp9_mbgraph.c', 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', 'libvpx/vp9/encoder/vp9_noise_estimate.c', 'libvpx/vp9/encoder/vp9_picklpf.c', 'libvpx/vp9/encoder/vp9_pickmode.c', @@ -176,19 +182,22 @@ files = { 'libvpx/vp9/encoder/vp9_svc_layercontext.c', 'libvpx/vp9/encoder/vp9_temporal_filter.c', 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/encoder/x86/temporal_filter_avx2.c', + 'libvpx/vp9/encoder/x86/temporal_filter_sse4.c', + 'libvpx/vp9/encoder/x86/temporal_filter_ssse3.c', 'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c', 'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm', - 'libvpx/vp9/encoder/x86/vp9_dct_ssse3.c', - 'libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c', - 'libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_error_avx2.c', 'libvpx/vp9/encoder/x86/vp9_error_sse2.asm', 'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c', 'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c', - 'libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm', - 'libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c', 'libvpx/vp9/vp9_cx_iface.c', 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', 'libvpx/vpx/src/vpx_codec.c', 'libvpx/vpx/src/vpx_decoder.c', 'libvpx/vpx/src/vpx_encoder.c', @@ -208,13 +217,18 @@ files = { 'libvpx/vpx_dsp/psnr.c', 'libvpx/vpx_dsp/quantize.c', 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', 'libvpx/vpx_dsp/subtract.c', 'libvpx/vpx_dsp/sum_squares.c', 'libvpx/vpx_dsp/variance.c', 'libvpx/vpx_dsp/vpx_convolve.c', 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', 'libvpx/vpx_dsp/x86/add_noise_sse2.asm', + 'libvpx/vpx_dsp/x86/avg_intrin_avx2.c', 'libvpx/vpx_dsp/x86/avg_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/avg_pred_avx2.c', + 'libvpx/vpx_dsp/x86/avg_pred_sse2.c', 'libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm', 'libvpx/vpx_dsp/x86/deblock_sse2.asm', 'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c', @@ -222,30 +236,34 @@ files = { 'libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm', 'libvpx/vpx_dsp/x86/intrapred_sse2.asm', 'libvpx/vpx_dsp/x86/intrapred_ssse3.asm', + 'libvpx/vpx_dsp/x86/inv_txfm_avx2.c', 'libvpx/vpx_dsp/x86/inv_txfm_sse2.c', - 'libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm', + 'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c', 'libvpx/vpx_dsp/x86/inv_wht_sse2.asm', 'libvpx/vpx_dsp/x86/loopfilter_avx2.c', - 'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c', - 'libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm', + 'libvpx/vpx_dsp/x86/loopfilter_sse2.c', + 'libvpx/vpx_dsp/x86/post_proc_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_avx.c', + 'libvpx/vpx_dsp/x86/quantize_avx2.c', 'libvpx/vpx_dsp/x86/quantize_sse2.c', - 'libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm', + 'libvpx/vpx_dsp/x86/quantize_ssse3.c', 'libvpx/vpx_dsp/x86/sad4d_avx2.c', + 'libvpx/vpx_dsp/x86/sad4d_avx512.c', 'libvpx/vpx_dsp/x86/sad4d_sse2.asm', 'libvpx/vpx_dsp/x86/sad_avx2.c', + 'libvpx/vpx_dsp/x86/sad_avx512.c', 'libvpx/vpx_dsp/x86/sad_sse2.asm', - 'libvpx/vpx_dsp/x86/sad_sse3.asm', - 'libvpx/vpx_dsp/x86/sad_sse4.asm', - 'libvpx/vpx_dsp/x86/sad_ssse3.asm', + 'libvpx/vpx_dsp/x86/sse_avx2.c', + 'libvpx/vpx_dsp/x86/sse_sse4.c', 'libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm', 'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm', + 'libvpx/vpx_dsp/x86/subtract_avx2.c', 'libvpx/vpx_dsp/x86/subtract_sse2.asm', 'libvpx/vpx_dsp/x86/sum_squares_sse2.c', 'libvpx/vpx_dsp/x86/variance_avx2.c', - 'libvpx/vpx_dsp/x86/variance_impl_avx2.c', 'libvpx/vpx_dsp/x86/variance_sse2.c', - 'libvpx/vpx_dsp/x86/vpx_asm_stubs.c', 'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm', @@ -253,7 +271,7 @@ files = { 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm', 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm', 'libvpx/vpx_mem/vpx_mem.c', - 'libvpx/vpx_ports/emms.asm', + 'libvpx/vpx_ports/emms_mmx.asm', 'libvpx/vpx_ports/x86_abi_support.asm', 'libvpx/vpx_scale/generic/gen_scalers.c', 'libvpx/vpx_scale/generic/vpx_scale.c', @@ -261,6 +279,7 @@ files = { 'libvpx/vpx_scale/generic/yv12extend.c', 'libvpx/vpx_scale/vpx_scale_rtcd.c', 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', ], 'IA32_EXPORTS': [ 'libvpx/vpx/vp8.h', @@ -269,13 +288,17 @@ files = { 'libvpx/vpx/vpx_codec.h', 'libvpx/vpx/vpx_decoder.h', 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', 'libvpx/vpx/vpx_frame_buffer.h', 'libvpx/vpx/vpx_image.h', 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', 'libvpx/vpx_mem/vpx_mem.h', 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', 'libvpx/vpx_ports/system_state.h', 'libvpx/vpx_ports/vpx_timer.h', 'libvpx/vpx_ports/x86.h', @@ -285,7 +308,6 @@ files = { 'IA32_SOURCES': [ 'libvpx/vp8/common/alloccommon.c', 'libvpx/vp8/common/blockd.c', - 'libvpx/vp8/common/copy_c.c', 'libvpx/vp8/common/dequantize.c', 'libvpx/vp8/common/entropy.c', 'libvpx/vp8/common/entropymode.c', @@ -310,10 +332,9 @@ files = { 'libvpx/vp8/common/swapyv12buffer.c', 'libvpx/vp8/common/treecoder.c', 'libvpx/vp8/common/vp8_loopfilter.c', - 'libvpx/vp8/common/x86/copy_sse2.asm', - 'libvpx/vp8/common/x86/copy_sse3.asm', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/common/x86/bilinear_filter_sse2.c', 'libvpx/vp8/common/x86/dequantize_mmx.asm', - 'libvpx/vp8/common/x86/filter_x86.c', 'libvpx/vp8/common/x86/idct_blk_mmx.c', 'libvpx/vp8/common/x86/idct_blk_sse2.c', 'libvpx/vp8/common/x86/idctllm_mmx.asm', @@ -336,6 +357,7 @@ files = { 'libvpx/vp8/decoder/threading.c', 'libvpx/vp8/encoder/bitstream.c', 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', 'libvpx/vp8/encoder/dct.c', 'libvpx/vp8/encoder/denoising.c', 'libvpx/vp8/encoder/encodeframe.c', @@ -358,17 +380,17 @@ files = { 'libvpx/vp8/encoder/tokenize.c', 'libvpx/vp8/encoder/treewriter.c', 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/encoder/x86/block_error_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse2.asm', + 'libvpx/vp8/encoder/x86/copy_sse3.asm', 'libvpx/vp8/encoder/x86/dct_sse2.asm', 'libvpx/vp8/encoder/x86/denoising_sse2.c', - 'libvpx/vp8/encoder/x86/encodeopt.asm', 'libvpx/vp8/encoder/x86/fwalsh_sse2.asm', - 'libvpx/vp8/encoder/x86/quantize_mmx.asm', 'libvpx/vp8/encoder/x86/quantize_sse4.c', - 'libvpx/vp8/encoder/x86/quantize_ssse3.c', 'libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm', - 'libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c', 'libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c', 'libvpx/vp8/encoder/x86/vp8_quantize_sse2.c', + 'libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c', 'libvpx/vp8/vp8_cx_iface.c', 'libvpx/vp8/vp8_dx_iface.c', 'libvpx/vp9/common/vp9_alloccommon.c', @@ -401,7 +423,7 @@ files = { 'libvpx/vp9/decoder/vp9_decoder.c', 'libvpx/vp9/decoder/vp9_detokenize.c', 'libvpx/vp9/decoder/vp9_dsubexp.c', - 'libvpx/vp9/decoder/vp9_dthread.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', 'libvpx/vp9/encoder/vp9_aq_360.c', 'libvpx/vp9/encoder/vp9_aq_complexity.c', @@ -416,11 +438,14 @@ files = { 'libvpx/vp9/encoder/vp9_encodemv.c', 'libvpx/vp9/encoder/vp9_encoder.c', 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', 'libvpx/vp9/encoder/vp9_extend.c', 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', 'libvpx/vp9/encoder/vp9_lookahead.c', 'libvpx/vp9/encoder/vp9_mbgraph.c', 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', 'libvpx/vp9/encoder/vp9_noise_estimate.c', 'libvpx/vp9/encoder/vp9_picklpf.c', 'libvpx/vp9/encoder/vp9_pickmode.c', @@ -436,18 +461,22 @@ files = { 'libvpx/vp9/encoder/vp9_svc_layercontext.c', 'libvpx/vp9/encoder/vp9_temporal_filter.c', 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/encoder/x86/temporal_filter_avx2.c', + 'libvpx/vp9/encoder/x86/temporal_filter_sse4.c', + 'libvpx/vp9/encoder/x86/temporal_filter_ssse3.c', 'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c', 'libvpx/vp9/encoder/x86/vp9_dct_sse2.asm', - 'libvpx/vp9/encoder/x86/vp9_dct_ssse3.c', - 'libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c', - 'libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c', + 'libvpx/vp9/encoder/x86/vp9_error_avx2.c', 'libvpx/vp9/encoder/x86/vp9_error_sse2.asm', 'libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c', + 'libvpx/vp9/encoder/x86/vp9_quantize_avx2.c', 'libvpx/vp9/encoder/x86/vp9_quantize_sse2.c', - 'libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', + 'libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c', 'libvpx/vp9/vp9_cx_iface.c', 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', 'libvpx/vpx/src/vpx_codec.c', 'libvpx/vpx/src/vpx_decoder.c', 'libvpx/vpx/src/vpx_encoder.c', @@ -467,38 +496,50 @@ files = { 'libvpx/vpx_dsp/psnr.c', 'libvpx/vpx_dsp/quantize.c', 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', 'libvpx/vpx_dsp/subtract.c', 'libvpx/vpx_dsp/sum_squares.c', 'libvpx/vpx_dsp/variance.c', 'libvpx/vpx_dsp/vpx_convolve.c', 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', 'libvpx/vpx_dsp/x86/add_noise_sse2.asm', + 'libvpx/vpx_dsp/x86/avg_intrin_avx2.c', 'libvpx/vpx_dsp/x86/avg_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/avg_pred_avx2.c', + 'libvpx/vpx_dsp/x86/avg_pred_sse2.c', 'libvpx/vpx_dsp/x86/deblock_sse2.asm', 'libvpx/vpx_dsp/x86/fwd_txfm_avx2.c', 'libvpx/vpx_dsp/x86/fwd_txfm_sse2.c', 'libvpx/vpx_dsp/x86/intrapred_sse2.asm', 'libvpx/vpx_dsp/x86/intrapred_ssse3.asm', + 'libvpx/vpx_dsp/x86/inv_txfm_avx2.c', 'libvpx/vpx_dsp/x86/inv_txfm_sse2.c', + 'libvpx/vpx_dsp/x86/inv_txfm_ssse3.c', 'libvpx/vpx_dsp/x86/inv_wht_sse2.asm', 'libvpx/vpx_dsp/x86/loopfilter_avx2.c', - 'libvpx/vpx_dsp/x86/loopfilter_intrin_sse2.c', + 'libvpx/vpx_dsp/x86/loopfilter_sse2.c', + 'libvpx/vpx_dsp/x86/post_proc_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_avx.c', + 'libvpx/vpx_dsp/x86/quantize_avx2.c', 'libvpx/vpx_dsp/x86/quantize_sse2.c', + 'libvpx/vpx_dsp/x86/quantize_ssse3.c', 'libvpx/vpx_dsp/x86/sad4d_avx2.c', + 'libvpx/vpx_dsp/x86/sad4d_avx512.c', 'libvpx/vpx_dsp/x86/sad4d_sse2.asm', 'libvpx/vpx_dsp/x86/sad_avx2.c', + 'libvpx/vpx_dsp/x86/sad_avx512.c', 'libvpx/vpx_dsp/x86/sad_sse2.asm', - 'libvpx/vpx_dsp/x86/sad_sse3.asm', - 'libvpx/vpx_dsp/x86/sad_sse4.asm', - 'libvpx/vpx_dsp/x86/sad_ssse3.asm', + 'libvpx/vpx_dsp/x86/sse_avx2.c', + 'libvpx/vpx_dsp/x86/sse_sse4.c', 'libvpx/vpx_dsp/x86/subpel_variance_sse2.asm', + 'libvpx/vpx_dsp/x86/subtract_avx2.c', 'libvpx/vpx_dsp/x86/subtract_sse2.asm', 'libvpx/vpx_dsp/x86/sum_squares_sse2.c', 'libvpx/vpx_dsp/x86/variance_avx2.c', - 'libvpx/vpx_dsp/x86/variance_impl_avx2.c', 'libvpx/vpx_dsp/x86/variance_sse2.c', - 'libvpx/vpx_dsp/x86/vpx_asm_stubs.c', 'libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm', + 'libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c', 'libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm', @@ -506,7 +547,7 @@ files = { 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm', 'libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm', 'libvpx/vpx_mem/vpx_mem.c', - 'libvpx/vpx_ports/emms.asm', + 'libvpx/vpx_ports/emms_mmx.c', 'libvpx/vpx_ports/x86_abi_support.asm', 'libvpx/vpx_scale/generic/gen_scalers.c', 'libvpx/vpx_scale/generic/vpx_scale.c', @@ -514,6 +555,7 @@ files = { 'libvpx/vpx_scale/generic/yv12extend.c', 'libvpx/vpx_scale/vpx_scale_rtcd.c', 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', ], 'ARM_EXPORTS': [ 'libvpx/vpx/vp8.h', @@ -522,14 +564,19 @@ files = { 'libvpx/vpx/vpx_codec.h', 'libvpx/vpx/vpx_decoder.h', 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', 'libvpx/vpx/vpx_frame_buffer.h', 'libvpx/vpx/vpx_image.h', 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', 'libvpx/vpx_mem/vpx_mem.h', 'libvpx/vpx_ports/arm.h', + 'libvpx/vpx_ports/arm_cpudetect.h', 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', 'libvpx/vpx_ports/system_state.h', 'libvpx/vpx_ports/vpx_timer.h', 'libvpx/vpx_scale/vpx_scale.h', @@ -544,8 +591,6 @@ files = { 'libvpx/vp8/common/arm/neon/dequant_idct_neon.c', 'libvpx/vp8/common/arm/neon/dequantizeb_neon.c', 'libvpx/vp8/common/arm/neon/idct_blk_neon.c', - 'libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c', - 'libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c', 'libvpx/vp8/common/arm/neon/iwalsh_neon.c', 'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', 'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', @@ -554,7 +599,6 @@ files = { 'libvpx/vp8/common/arm/neon/sixtappredict_neon.c', 'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c', 'libvpx/vp8/common/blockd.c', - 'libvpx/vp8/common/copy_c.c', 'libvpx/vp8/common/dequantize.c', 'libvpx/vp8/common/entropy.c', 'libvpx/vp8/common/entropymode.c', @@ -577,6 +621,7 @@ files = { 'libvpx/vp8/common/swapyv12buffer.c', 'libvpx/vp8/common/treecoder.c', 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', 'libvpx/vp8/decoder/dboolhuff.c', 'libvpx/vp8/decoder/decodeframe.c', 'libvpx/vp8/decoder/decodemv.c', @@ -589,6 +634,7 @@ files = { 'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', 'libvpx/vp8/encoder/bitstream.c', 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', 'libvpx/vp8/encoder/dct.c', 'libvpx/vp8/encoder/denoising.c', 'libvpx/vp8/encoder/encodeframe.c', @@ -611,6 +657,7 @@ files = { 'libvpx/vp8/encoder/vp8_quantize.c', 'libvpx/vp8/vp8_cx_iface.c', 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c', 'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c', 'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c', 'libvpx/vp9/common/vp9_alloccommon.c', @@ -639,15 +686,13 @@ files = { 'libvpx/vp9/decoder/vp9_decoder.c', 'libvpx/vp9/decoder/vp9_detokenize.c', 'libvpx/vp9/decoder/vp9_dsubexp.c', - 'libvpx/vp9/decoder/vp9_dthread.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', 'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c', 'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c', 'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c', - 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', - 'libvpx/vp9/encoder/vp9_aq_360.c', - 'libvpx/vp9/encoder/vp9_aq_complexity.c', 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', - 'libvpx/vp9/encoder/vp9_aq_variance.c', 'libvpx/vp9/encoder/vp9_bitstream.c', 'libvpx/vp9/encoder/vp9_context_tree.c', 'libvpx/vp9/encoder/vp9_cost.c', @@ -657,11 +702,12 @@ files = { 'libvpx/vp9/encoder/vp9_encodemv.c', 'libvpx/vp9/encoder/vp9_encoder.c', 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', 'libvpx/vp9/encoder/vp9_extend.c', - 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', 'libvpx/vp9/encoder/vp9_lookahead.c', - 'libvpx/vp9/encoder/vp9_mbgraph.c', 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', 'libvpx/vp9/encoder/vp9_noise_estimate.c', 'libvpx/vp9/encoder/vp9_picklpf.c', 'libvpx/vp9/encoder/vp9_pickmode.c', @@ -675,46 +721,62 @@ files = { 'libvpx/vp9/encoder/vp9_speed_features.c', 'libvpx/vp9/encoder/vp9_subexp.c', 'libvpx/vp9/encoder/vp9_svc_layercontext.c', - 'libvpx/vp9/encoder/vp9_temporal_filter.c', 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', 'libvpx/vp9/encoder/vp9_treewriter.c', 'libvpx/vp9/vp9_cx_iface.c', 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', 'libvpx/vpx/src/vpx_codec.c', 'libvpx/vpx/src/vpx_decoder.c', 'libvpx/vpx/src/vpx_encoder.c', 'libvpx/vpx/src/vpx_image.c', 'libvpx/vpx_dsp/arm/avg_neon.c', - 'libvpx/vpx_dsp/arm/fwd_txfm_neon.c', + 'libvpx/vpx_dsp/arm/avg_pred_neon.c', + 'libvpx/vpx_dsp/arm/fdct16x16_neon.c', + 'libvpx/vpx_dsp/arm/fdct32x32_neon.c', + 'libvpx/vpx_dsp/arm/fdct4x4_neon.c', + 'libvpx/vpx_dsp/arm/fdct8x8_neon.c', + 'libvpx/vpx_dsp/arm/fdct_partial_neon.c', 'libvpx/vpx_dsp/arm/hadamard_neon.c', - 'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm', - 'libvpx/vpx_dsp/arm/idct16x16_add_neon.asm', - 'libvpx/vpx_dsp/arm/idct16x16_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_add_neon.c', 'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c', 'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c', 'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c', 'libvpx/vpx_dsp/arm/idct32x32_add_neon.c', 'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm', 'libvpx/vpx_dsp/arm/idct4x4_add_neon.asm', - 'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm', - 'libvpx/vpx_dsp/arm/idct8x8_add_neon.asm', + 'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_add_neon.c', 'libvpx/vpx_dsp/arm/idct_neon.asm', 'libvpx/vpx_dsp/arm/intrapred_neon.c', 'libvpx/vpx_dsp/arm/intrapred_neon_asm.asm', 'libvpx/vpx_dsp/arm/loopfilter_16_neon.asm', 'libvpx/vpx_dsp/arm/loopfilter_4_neon.asm', 'libvpx/vpx_dsp/arm/loopfilter_8_neon.asm', + 'libvpx/vpx_dsp/arm/quantize_neon.c', 'libvpx/vpx_dsp/arm/sad4d_neon.c', 'libvpx/vpx_dsp/arm/sad_neon.c', 'libvpx/vpx_dsp/arm/save_reg_neon.asm', + 'libvpx/vpx_dsp/arm/sse_neon.c', 'libvpx/vpx_dsp/arm/subpel_variance_neon.c', 'libvpx/vpx_dsp/arm/subtract_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_neon.c', 'libvpx/vpx_dsp/arm/variance_neon.c', - 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm', - 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm', + 'libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm', 'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm', 'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm', 'libvpx/vpx_dsp/arm/vpx_convolve_neon.c', + 'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c', 'libvpx/vpx_dsp/avg.c', 'libvpx/vpx_dsp/bitreader.c', 'libvpx/vpx_dsp/bitreader_buffer.c', @@ -728,43 +790,65 @@ files = { 'libvpx/vpx_dsp/psnr.c', 'libvpx/vpx_dsp/quantize.c', 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', 'libvpx/vpx_dsp/subtract.c', 'libvpx/vpx_dsp/sum_squares.c', 'libvpx/vpx_dsp/variance.c', 'libvpx/vpx_dsp/vpx_convolve.c', 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', 'libvpx/vpx_mem/vpx_mem.c', - 'libvpx/vpx_ports/arm_cpudetect.c', + 'libvpx/vpx_ports/aarch32_cpudetect.c', 'libvpx/vpx_scale/generic/gen_scalers.c', 'libvpx/vpx_scale/generic/vpx_scale.c', 'libvpx/vpx_scale/generic/yv12config.c', 'libvpx/vpx_scale/generic/yv12extend.c', 'libvpx/vpx_scale/vpx_scale_rtcd.c', 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', ], - 'GENERIC_EXPORTS': [ + 'AARCH64_EXPORTS': [ 'libvpx/vpx/vp8.h', 'libvpx/vpx/vp8cx.h', 'libvpx/vpx/vp8dx.h', 'libvpx/vpx/vpx_codec.h', 'libvpx/vpx/vpx_decoder.h', 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', 'libvpx/vpx/vpx_frame_buffer.h', 'libvpx/vpx/vpx_image.h', 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/arm.h', + 'libvpx/vpx_ports/arm_cpudetect.h', 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', 'libvpx/vpx_ports/system_state.h', 'libvpx/vpx_ports/vpx_timer.h', 'libvpx/vpx_scale/vpx_scale.h', 'libvpx/vpx_scale/yv12config.h', ], - 'GENERIC_SOURCES': [ + 'AARCH64_SOURCES': [ 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/arm/loopfilter_arm.c', + 'libvpx/vp8/common/arm/neon/bilinearpredict_neon.c', + 'libvpx/vp8/common/arm/neon/copymem_neon.c', + 'libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c', + 'libvpx/vp8/common/arm/neon/dequant_idct_neon.c', + 'libvpx/vp8/common/arm/neon/dequantizeb_neon.c', + 'libvpx/vp8/common/arm/neon/idct_blk_neon.c', + 'libvpx/vp8/common/arm/neon/iwalsh_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c', + 'libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c', + 'libvpx/vp8/common/arm/neon/mbloopfilter_neon.c', + 'libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c', + 'libvpx/vp8/common/arm/neon/sixtappredict_neon.c', + 'libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c', 'libvpx/vp8/common/blockd.c', - 'libvpx/vp8/common/copy_c.c', 'libvpx/vp8/common/dequantize.c', 'libvpx/vp8/common/entropy.c', 'libvpx/vp8/common/entropymode.c', @@ -787,6 +871,238 @@ files = { 'libvpx/vp8/common/swapyv12buffer.c', 'libvpx/vp8/common/treecoder.c', 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/arm/neon/denoising_neon.c', + 'libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c', + 'libvpx/vp8/encoder/arm/neon/shortfdct_neon.c', + 'libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c', + 'libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_error_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_error_sve.c', + 'libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c', + 'libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx_dsp/arm/avg_neon.c', + 'libvpx/vpx_dsp/arm/avg_pred_neon.c', + 'libvpx/vpx_dsp/arm/fdct16x16_neon.c', + 'libvpx/vpx_dsp/arm/fdct32x32_neon.c', + 'libvpx/vpx_dsp/arm/fdct4x4_neon.c', + 'libvpx/vpx_dsp/arm/fdct8x8_neon.c', + 'libvpx/vpx_dsp/arm/fdct_partial_neon.c', + 'libvpx/vpx_dsp/arm/hadamard_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct16x16_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c', + 'libvpx/vpx_dsp/arm/idct32x32_add_neon.c', + 'libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct4x4_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c', + 'libvpx/vpx_dsp/arm/idct8x8_add_neon.c', + 'libvpx/vpx_dsp/arm/intrapred_neon.c', + 'libvpx/vpx_dsp/arm/loopfilter_neon.c', + 'libvpx/vpx_dsp/arm/quantize_neon.c', + 'libvpx/vpx_dsp/arm/sad4d_neon.c', + 'libvpx/vpx_dsp/arm/sad4d_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/sad_neon.c', + 'libvpx/vpx_dsp/arm/sad_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/sse_neon.c', + 'libvpx/vpx_dsp/arm/sse_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/subpel_variance_neon.c', + 'libvpx/vpx_dsp/arm/subtract_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_neon.c', + 'libvpx/vpx_dsp/arm/sum_squares_sve.c', + 'libvpx/vpx_dsp/arm/variance_neon.c', + 'libvpx/vpx_dsp/arm/variance_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c', + 'libvpx/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c', + 'libvpx/vpx_dsp/arm/vpx_convolve_neon.c', + 'libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/aarch64_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'MIPS32_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/asmdefs_mmi.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/mips.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'MIPS32_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', 'libvpx/vp8/decoder/dboolhuff.c', 'libvpx/vp8/decoder/decodeframe.c', 'libvpx/vp8/decoder/decodemv.c', @@ -795,6 +1111,7 @@ files = { 'libvpx/vp8/decoder/threading.c', 'libvpx/vp8/encoder/bitstream.c', 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', 'libvpx/vp8/encoder/dct.c', 'libvpx/vp8/encoder/denoising.c', 'libvpx/vp8/encoder/encodeframe.c', @@ -845,7 +1162,7 @@ files = { 'libvpx/vp9/decoder/vp9_decoder.c', 'libvpx/vp9/decoder/vp9_detokenize.c', 'libvpx/vp9/decoder/vp9_dsubexp.c', - 'libvpx/vp9/decoder/vp9_dthread.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', 'libvpx/vp9/encoder/vp9_aq_360.c', 'libvpx/vp9/encoder/vp9_aq_complexity.c', @@ -860,11 +1177,14 @@ files = { 'libvpx/vp9/encoder/vp9_encodemv.c', 'libvpx/vp9/encoder/vp9_encoder.c', 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', 'libvpx/vp9/encoder/vp9_extend.c', 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', 'libvpx/vp9/encoder/vp9_lookahead.c', 'libvpx/vp9/encoder/vp9_mbgraph.c', 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', 'libvpx/vp9/encoder/vp9_noise_estimate.c', 'libvpx/vp9/encoder/vp9_picklpf.c', 'libvpx/vp9/encoder/vp9_pickmode.c', @@ -880,9 +1200,11 @@ files = { 'libvpx/vp9/encoder/vp9_svc_layercontext.c', 'libvpx/vp9/encoder/vp9_temporal_filter.c', 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', 'libvpx/vp9/encoder/vp9_treewriter.c', 'libvpx/vp9/vp9_cx_iface.c', 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', 'libvpx/vpx/src/vpx_codec.c', 'libvpx/vpx/src/vpx_decoder.c', 'libvpx/vpx/src/vpx_encoder.c', @@ -900,6 +1222,844 @@ files = { 'libvpx/vpx_dsp/psnr.c', 'libvpx/vpx_dsp/quantize.c', 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/mips_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'MIPS64_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/asmdefs_mmi.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/mips.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'MIPS64_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/mips/mmi/copymem_mmi.c', + 'libvpx/vp8/common/mips/mmi/dequantize_mmi.c', + 'libvpx/vp8/common/mips/mmi/idct_blk_mmi.c', + 'libvpx/vp8/common/mips/mmi/idctllm_mmi.c', + 'libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c', + 'libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c', + 'libvpx/vp8/common/mips/msa/bilinear_filter_msa.c', + 'libvpx/vp8/common/mips/msa/copymem_msa.c', + 'libvpx/vp8/common/mips/msa/idct_msa.c', + 'libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c', + 'libvpx/vp8/common/mips/msa/sixtap_filter_msa.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/mips/mmi/dct_mmi.c', + 'libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c', + 'libvpx/vp8/encoder/mips/msa/dct_msa.c', + 'libvpx/vp8/encoder/mips/msa/denoising_msa.c', + 'libvpx/vp8/encoder/mips/msa/encodeopt_msa.c', + 'libvpx/vp8/encoder/mips/msa/quantize_msa.c', + 'libvpx/vp8/encoder/mips/msa/temporal_filter_msa.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c', + 'libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c', + 'libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/mips/msa/vp9_error_msa.c', + 'libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c', + 'libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c', + 'libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/mips/avg_msa.c', + 'libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c', + 'libvpx/vpx_dsp/mips/fwd_txfm_msa.c', + 'libvpx/vpx_dsp/mips/idct16x16_msa.c', + 'libvpx/vpx_dsp/mips/idct32x32_msa.c', + 'libvpx/vpx_dsp/mips/idct4x4_msa.c', + 'libvpx/vpx_dsp/mips/idct8x8_msa.c', + 'libvpx/vpx_dsp/mips/intrapred_msa.c', + 'libvpx/vpx_dsp/mips/loopfilter_16_msa.c', + 'libvpx/vpx_dsp/mips/loopfilter_4_msa.c', + 'libvpx/vpx_dsp/mips/loopfilter_8_msa.c', + 'libvpx/vpx_dsp/mips/sad_mmi.c', + 'libvpx/vpx_dsp/mips/sad_msa.c', + 'libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c', + 'libvpx/vpx_dsp/mips/subtract_mmi.c', + 'libvpx/vpx_dsp/mips/subtract_msa.c', + 'libvpx/vpx_dsp/mips/sum_squares_msa.c', + 'libvpx/vpx_dsp/mips/variance_mmi.c', + 'libvpx/vpx_dsp/mips/variance_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c', + 'libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/mips_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'PPC64LE_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/ppc.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'PPC64LE_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/ppc/vp9_idct_vsx.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/ppc/fdct32x32_vsx.c', + 'libvpx/vpx_dsp/ppc/hadamard_vsx.c', + 'libvpx/vpx_dsp/ppc/intrapred_vsx.c', + 'libvpx/vpx_dsp/ppc/inv_txfm_vsx.c', + 'libvpx/vpx_dsp/ppc/quantize_vsx.c', + 'libvpx/vpx_dsp/ppc/sad_vsx.c', + 'libvpx/vpx_dsp/ppc/subtract_vsx.c', + 'libvpx/vpx_dsp/ppc/variance_vsx.c', + 'libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/ppc_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'LOONGARCH64_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/loongarch.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'LOONGARCH64_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loongarch/idct_lsx.c', + 'libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c', + 'libvpx/vp8/common/loongarch/sixtap_filter_lsx.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/loongarch/dct_lsx.c', + 'libvpx/vp8/encoder/loongarch/encodeopt_lsx.c', + 'libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loongarch/avg_lsx.c', + 'libvpx/vpx_dsp/loongarch/avg_pred_lsx.c', + 'libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c', + 'libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c', + 'libvpx/vpx_dsp/loongarch/idct32x32_lsx.c', + 'libvpx/vpx_dsp/loongarch/intrapred_lsx.c', + 'libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c', + 'libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c', + 'libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c', + 'libvpx/vpx_dsp/loongarch/quantize_lsx.c', + 'libvpx/vpx_dsp/loongarch/sad_lsx.c', + 'libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c', + 'libvpx/vpx_dsp/loongarch/subtract_lsx.c', + 'libvpx/vpx_dsp/loongarch/variance_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c', + 'libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', + 'libvpx/vpx_dsp/subtract.c', + 'libvpx/vpx_dsp/sum_squares.c', + 'libvpx/vpx_dsp/variance.c', + 'libvpx/vpx_dsp/vpx_convolve.c', + 'libvpx/vpx_dsp/vpx_dsp_rtcd.c', + 'libvpx/vpx_mem/vpx_mem.c', + 'libvpx/vpx_ports/loongarch_cpudetect.c', + 'libvpx/vpx_scale/generic/gen_scalers.c', + 'libvpx/vpx_scale/generic/vpx_scale.c', + 'libvpx/vpx_scale/generic/yv12config.c', + 'libvpx/vpx_scale/generic/yv12extend.c', + 'libvpx/vpx_scale/vpx_scale_rtcd.c', + 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', +], + 'GENERIC_EXPORTS': [ + 'libvpx/vpx/vp8.h', + 'libvpx/vpx/vp8cx.h', + 'libvpx/vpx/vp8dx.h', + 'libvpx/vpx/vpx_codec.h', + 'libvpx/vpx/vpx_decoder.h', + 'libvpx/vpx/vpx_encoder.h', + 'libvpx/vpx/vpx_ext_ratectrl.h', + 'libvpx/vpx/vpx_frame_buffer.h', + 'libvpx/vpx/vpx_image.h', + 'libvpx/vpx/vpx_integer.h', + 'libvpx/vpx/vpx_tpl.h', + 'libvpx/vpx_mem/include/vpx_mem_intrnl.h', + 'libvpx/vpx_mem/vpx_mem.h', + 'libvpx/vpx_ports/bitops.h', + 'libvpx/vpx_ports/compiler_attributes.h', + 'libvpx/vpx_ports/mem.h', + 'libvpx/vpx_ports/static_assert.h', + 'libvpx/vpx_ports/system_state.h', + 'libvpx/vpx_ports/vpx_timer.h', + 'libvpx/vpx_scale/vpx_scale.h', + 'libvpx/vpx_scale/yv12config.h', +], + 'GENERIC_SOURCES': [ + 'libvpx/vp8/common/alloccommon.c', + 'libvpx/vp8/common/blockd.c', + 'libvpx/vp8/common/dequantize.c', + 'libvpx/vp8/common/entropy.c', + 'libvpx/vp8/common/entropymode.c', + 'libvpx/vp8/common/entropymv.c', + 'libvpx/vp8/common/extend.c', + 'libvpx/vp8/common/filter.c', + 'libvpx/vp8/common/findnearmv.c', + 'libvpx/vp8/common/generic/systemdependent.c', + 'libvpx/vp8/common/idct_blk.c', + 'libvpx/vp8/common/idctllm.c', + 'libvpx/vp8/common/loopfilter_filters.c', + 'libvpx/vp8/common/mbpitch.c', + 'libvpx/vp8/common/modecont.c', + 'libvpx/vp8/common/quant_common.c', + 'libvpx/vp8/common/reconinter.c', + 'libvpx/vp8/common/reconintra.c', + 'libvpx/vp8/common/reconintra4x4.c', + 'libvpx/vp8/common/rtcd.c', + 'libvpx/vp8/common/setupintrarecon.c', + 'libvpx/vp8/common/swapyv12buffer.c', + 'libvpx/vp8/common/treecoder.c', + 'libvpx/vp8/common/vp8_loopfilter.c', + 'libvpx/vp8/common/vp8_skin_detection.c', + 'libvpx/vp8/decoder/dboolhuff.c', + 'libvpx/vp8/decoder/decodeframe.c', + 'libvpx/vp8/decoder/decodemv.c', + 'libvpx/vp8/decoder/detokenize.c', + 'libvpx/vp8/decoder/onyxd_if.c', + 'libvpx/vp8/decoder/threading.c', + 'libvpx/vp8/encoder/bitstream.c', + 'libvpx/vp8/encoder/boolhuff.c', + 'libvpx/vp8/encoder/copy_c.c', + 'libvpx/vp8/encoder/dct.c', + 'libvpx/vp8/encoder/denoising.c', + 'libvpx/vp8/encoder/encodeframe.c', + 'libvpx/vp8/encoder/encodeintra.c', + 'libvpx/vp8/encoder/encodemb.c', + 'libvpx/vp8/encoder/encodemv.c', + 'libvpx/vp8/encoder/ethreading.c', + 'libvpx/vp8/encoder/firstpass.c', + 'libvpx/vp8/encoder/lookahead.c', + 'libvpx/vp8/encoder/mcomp.c', + 'libvpx/vp8/encoder/modecosts.c', + 'libvpx/vp8/encoder/mr_dissim.c', + 'libvpx/vp8/encoder/onyx_if.c', + 'libvpx/vp8/encoder/pickinter.c', + 'libvpx/vp8/encoder/picklpf.c', + 'libvpx/vp8/encoder/ratectrl.c', + 'libvpx/vp8/encoder/rdopt.c', + 'libvpx/vp8/encoder/segmentation.c', + 'libvpx/vp8/encoder/temporal_filter.c', + 'libvpx/vp8/encoder/tokenize.c', + 'libvpx/vp8/encoder/treewriter.c', + 'libvpx/vp8/encoder/vp8_quantize.c', + 'libvpx/vp8/vp8_cx_iface.c', + 'libvpx/vp8/vp8_dx_iface.c', + 'libvpx/vp9/common/vp9_alloccommon.c', + 'libvpx/vp9/common/vp9_blockd.c', + 'libvpx/vp9/common/vp9_common_data.c', + 'libvpx/vp9/common/vp9_entropy.c', + 'libvpx/vp9/common/vp9_entropymode.c', + 'libvpx/vp9/common/vp9_entropymv.c', + 'libvpx/vp9/common/vp9_filter.c', + 'libvpx/vp9/common/vp9_frame_buffers.c', + 'libvpx/vp9/common/vp9_idct.c', + 'libvpx/vp9/common/vp9_loopfilter.c', + 'libvpx/vp9/common/vp9_mvref_common.c', + 'libvpx/vp9/common/vp9_pred_common.c', + 'libvpx/vp9/common/vp9_quant_common.c', + 'libvpx/vp9/common/vp9_reconinter.c', + 'libvpx/vp9/common/vp9_reconintra.c', + 'libvpx/vp9/common/vp9_rtcd.c', + 'libvpx/vp9/common/vp9_scale.c', + 'libvpx/vp9/common/vp9_scan.c', + 'libvpx/vp9/common/vp9_seg_common.c', + 'libvpx/vp9/common/vp9_thread_common.c', + 'libvpx/vp9/common/vp9_tile_common.c', + 'libvpx/vp9/decoder/vp9_decodeframe.c', + 'libvpx/vp9/decoder/vp9_decodemv.c', + 'libvpx/vp9/decoder/vp9_decoder.c', + 'libvpx/vp9/decoder/vp9_detokenize.c', + 'libvpx/vp9/decoder/vp9_dsubexp.c', + 'libvpx/vp9/decoder/vp9_job_queue.c', + 'libvpx/vp9/encoder/vp9_alt_ref_aq.c', + 'libvpx/vp9/encoder/vp9_aq_360.c', + 'libvpx/vp9/encoder/vp9_aq_complexity.c', + 'libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c', + 'libvpx/vp9/encoder/vp9_aq_variance.c', + 'libvpx/vp9/encoder/vp9_bitstream.c', + 'libvpx/vp9/encoder/vp9_context_tree.c', + 'libvpx/vp9/encoder/vp9_cost.c', + 'libvpx/vp9/encoder/vp9_dct.c', + 'libvpx/vp9/encoder/vp9_encodeframe.c', + 'libvpx/vp9/encoder/vp9_encodemb.c', + 'libvpx/vp9/encoder/vp9_encodemv.c', + 'libvpx/vp9/encoder/vp9_encoder.c', + 'libvpx/vp9/encoder/vp9_ethread.c', + 'libvpx/vp9/encoder/vp9_ext_ratectrl.c', + 'libvpx/vp9/encoder/vp9_extend.c', + 'libvpx/vp9/encoder/vp9_firstpass.c', + 'libvpx/vp9/encoder/vp9_frame_scale.c', + 'libvpx/vp9/encoder/vp9_lookahead.c', + 'libvpx/vp9/encoder/vp9_mbgraph.c', + 'libvpx/vp9/encoder/vp9_mcomp.c', + 'libvpx/vp9/encoder/vp9_multi_thread.c', + 'libvpx/vp9/encoder/vp9_noise_estimate.c', + 'libvpx/vp9/encoder/vp9_picklpf.c', + 'libvpx/vp9/encoder/vp9_pickmode.c', + 'libvpx/vp9/encoder/vp9_quantize.c', + 'libvpx/vp9/encoder/vp9_ratectrl.c', + 'libvpx/vp9/encoder/vp9_rd.c', + 'libvpx/vp9/encoder/vp9_rdopt.c', + 'libvpx/vp9/encoder/vp9_resize.c', + 'libvpx/vp9/encoder/vp9_segmentation.c', + 'libvpx/vp9/encoder/vp9_skin_detection.c', + 'libvpx/vp9/encoder/vp9_speed_features.c', + 'libvpx/vp9/encoder/vp9_subexp.c', + 'libvpx/vp9/encoder/vp9_svc_layercontext.c', + 'libvpx/vp9/encoder/vp9_temporal_filter.c', + 'libvpx/vp9/encoder/vp9_tokenize.c', + 'libvpx/vp9/encoder/vp9_tpl_model.c', + 'libvpx/vp9/encoder/vp9_treewriter.c', + 'libvpx/vp9/vp9_cx_iface.c', + 'libvpx/vp9/vp9_dx_iface.c', + 'libvpx/vp9/vp9_iface_common.c', + 'libvpx/vpx/src/vpx_codec.c', + 'libvpx/vpx/src/vpx_decoder.c', + 'libvpx/vpx/src/vpx_encoder.c', + 'libvpx/vpx/src/vpx_image.c', + 'libvpx/vpx_dsp/avg.c', + 'libvpx/vpx_dsp/bitreader.c', + 'libvpx/vpx_dsp/bitreader_buffer.c', + 'libvpx/vpx_dsp/bitwriter.c', + 'libvpx/vpx_dsp/bitwriter_buffer.c', + 'libvpx/vpx_dsp/fwd_txfm.c', + 'libvpx/vpx_dsp/intrapred.c', + 'libvpx/vpx_dsp/inv_txfm.c', + 'libvpx/vpx_dsp/loopfilter.c', + 'libvpx/vpx_dsp/prob.c', + 'libvpx/vpx_dsp/psnr.c', + 'libvpx/vpx_dsp/quantize.c', + 'libvpx/vpx_dsp/sad.c', + 'libvpx/vpx_dsp/skin_detection.c', + 'libvpx/vpx_dsp/sse.c', 'libvpx/vpx_dsp/subtract.c', 'libvpx/vpx_dsp/sum_squares.c', 'libvpx/vpx_dsp/variance.c', @@ -912,5 +2072,6 @@ files = { 'libvpx/vpx_scale/generic/yv12extend.c', 'libvpx/vpx_scale/vpx_scale_rtcd.c', 'libvpx/vpx_util/vpx_thread.c', + 'libvpx/vpx_util/vpx_write_yuv_frame.c', ], }